Spaces: Running on Zero

gokaygokay committed
Commit 0a88b62 · 1 Parent(s): 23ce364

Upload 93 files

This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +37 -35
- README.md +4 -7
- app.py +284 -0
- assets/logo.png +0 -0
- assets/overview_3.png +0 -0
- assets/radar.png +0 -0
- assets/runtime.png +0 -0
- assets/teaser.png +3 -0
- demos/example_000.png +0 -0
- demos/example_001.png +0 -0
- demos/example_002.png +0 -0
- demos/example_003.png +3 -0
- demos/example_list.txt +2 -0
- infer/__init__.py +28 -0
- infer/gif_render.py +55 -0
- infer/image_to_views.py +81 -0
- infer/rembg.py +26 -0
- infer/text_to_image.py +80 -0
- infer/utils.py +77 -0
- infer/views_to_mesh.py +94 -0
- mvd/__init__.py +0 -0
- mvd/hunyuan3d_mvd_lite_pipeline.py +493 -0
- mvd/hunyuan3d_mvd_std_pipeline.py +471 -0
- mvd/utils.py +85 -0
- requirements.txt +22 -0
- scripts/image_to_3d.sh +8 -0
- scripts/image_to_3d_demo.sh +8 -0
- scripts/image_to_3d_fast.sh +6 -0
- scripts/image_to_3d_fast_demo.sh +6 -0
- scripts/text_to_3d.sh +7 -0
- scripts/text_to_3d_demo.sh +7 -0
- scripts/text_to_3d_fast.sh +6 -0
- scripts/text_to_3d_fast_demo.sh +6 -0
- svrm/.DS_Store +0 -0
- svrm/configs/2024-10-24T22-36-18-project.yaml +32 -0
- svrm/configs/svrm.yaml +32 -0
- svrm/ldm/.DS_Store +0 -0
- svrm/ldm/models/svrm.py +263 -0
- svrm/ldm/modules/attention.py +457 -0
- svrm/ldm/modules/encoders/__init__.py +0 -0
- svrm/ldm/modules/encoders/dinov2/__init__.py +0 -0
- svrm/ldm/modules/encoders/dinov2/hub/__init__.py +0 -0
- svrm/ldm/modules/encoders/dinov2/hub/backbones.py +156 -0
- svrm/ldm/modules/encoders/dinov2/hub/utils.py +39 -0
- svrm/ldm/modules/encoders/dinov2/layers/__init__.py +11 -0
- svrm/ldm/modules/encoders/dinov2/layers/attention.py +89 -0
- svrm/ldm/modules/encoders/dinov2/layers/block.py +269 -0
- svrm/ldm/modules/encoders/dinov2/layers/dino_head.py +58 -0
- svrm/ldm/modules/encoders/dinov2/layers/drop_path.py +34 -0
- svrm/ldm/modules/encoders/dinov2/layers/layer_scale.py +27 -0
.gitattributes
CHANGED
@@ -1,35 +1,37 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+demos/example_003.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,14 +1,11 @@
 ---
-title:
-emoji:
+title: Hunyuan3D-1.0
+emoji: 😻
 colorFrom: purple
 colorTo: red
 sdk: gradio
 sdk_version: 4.42.0
 app_file: app.py
 pinned: false
-
-
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+short_description: Text-to-3D and Image-to-3D Generation
+---
app.py
ADDED
@@ -0,0 +1,284 @@
import os
import warnings
from huggingface_hub import hf_hub_download
import gradio as gr
from glob import glob
import shutil
import torch
import numpy as np
from PIL import Image
from einops import rearrange
import argparse

# Suppress warnings
warnings.simplefilter('ignore', category=UserWarning)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=DeprecationWarning)

def download_models():
    # Create weights directory if it doesn't exist
    os.makedirs("weights", exist_ok=True)
    os.makedirs("weights/hunyuanDiT", exist_ok=True)

    # Download Hunyuan3D-1 model
    try:
        hf_hub_download(
            repo_id="tencent/Hunyuan3D-1",
            local_dir="./weights",
            resume_download=True
        )
        print("Successfully downloaded Hunyuan3D-1 model")
    except Exception as e:
        print(f"Error downloading Hunyuan3D-1: {e}")

    # Download HunyuanDiT model
    try:
        hf_hub_download(
            repo_id="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
            local_dir="./weights/hunyuanDiT",
            resume_download=True
        )
        print("Successfully downloaded HunyuanDiT model")
    except Exception as e:
        print(f"Error downloading HunyuanDiT: {e}")

# Download models before starting the app
download_models()

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("--use_lite", default=False, action="store_true")
parser.add_argument("--mv23d_cfg_path", default="./svrm/configs/svrm.yaml", type=str)
parser.add_argument("--mv23d_ckt_path", default="weights/svrm/svrm.safetensors", type=str)
parser.add_argument("--text2image_path", default="weights/hunyuanDiT", type=str)
parser.add_argument("--save_memory", default=False, action="store_true")
parser.add_argument("--device", default="cuda:0", type=str)
args = parser.parse_args()

# Constants
CONST_PORT = 8080
CONST_MAX_QUEUE = 1
CONST_SERVER = '0.0.0.0'

CONST_HEADER = '''
<h2><b>Official 🤗 Gradio Demo</b></h2>
<h2><a href='https://github.com/tencent/Hunyuan3D-1' target='_blank'>
<b>Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D Generation</b></a></h2>
'''

# Helper functions
def get_example_img_list():
    print('Loading example img list ...')
    return sorted(glob('./demos/example_*.png'))

def get_example_txt_list():
    print('Loading example txt list ...')
    txt_list = []
    for line in open('./demos/example_list.txt'):
        txt_list.append(line.strip())
    return txt_list

example_is = get_example_img_list()
example_ts = get_example_txt_list()

# Import required workers
from infer import seed_everything, save_gif
from infer import Text2Image, Removebg, Image2Views, Views2Mesh, GifRenderer

# Initialize workers
worker_xbg = Removebg()
print(f"loading {args.text2image_path}")
worker_t2i = Text2Image(
    pretrain=args.text2image_path,
    device=args.device,
    save_memory=args.save_memory
)
worker_i2v = Image2Views(
    use_lite=args.use_lite,
    device=args.device
)
worker_v23 = Views2Mesh(
    args.mv23d_cfg_path,
    args.mv23d_ckt_path,
    use_lite=args.use_lite,
    device=args.device
)
worker_gif = GifRenderer(args.device)

# Pipeline stages
def stage_0_t2i(text, image, seed, step):
    os.makedirs('./outputs/app_output', exist_ok=True)
    exists = set(int(_) for _ in os.listdir('./outputs/app_output') if not _.startswith("."))
    cur_id = min(set(range(30)) - exists) if len(exists) < 30 else 0

    if os.path.exists(f"./outputs/app_output/{(cur_id + 1) % 30}"):
        shutil.rmtree(f"./outputs/app_output/{(cur_id + 1) % 30}")
    save_folder = f'./outputs/app_output/{cur_id}'
    os.makedirs(save_folder, exist_ok=True)

    dst = save_folder + '/img.png'

    if not text:
        if image is None:
            return dst, save_folder
        image.save(dst)
        return dst, save_folder

    image = worker_t2i(text, seed, step)
    image.save(dst)
    dst = worker_xbg(image, save_folder)
    return dst, save_folder

def stage_1_xbg(image, save_folder):
    if isinstance(image, str):
        image = Image.open(image)
    dst = save_folder + '/img_nobg.png'
    rgba = worker_xbg(image)
    rgba.save(dst)
    return dst

def stage_2_i2v(image, seed, step, save_folder):
    if isinstance(image, str):
        image = Image.open(image)
    gif_dst = save_folder + '/views.gif'
    res_img, pils = worker_i2v(image, seed, step)
    save_gif(pils, gif_dst)
    views_img, cond_img = res_img[0], res_img[1]
    img_array = np.asarray(views_img, dtype=np.uint8)
    show_img = rearrange(img_array, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
    show_img = show_img[worker_i2v.order, ...]
    show_img = rearrange(show_img, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
    show_img = Image.fromarray(show_img)
    return views_img, cond_img, show_img

def stage_3_v23(views_pil, cond_pil, seed, save_folder, target_face_count=30000,
                do_texture_mapping=True, do_render=True):
    do_texture_mapping = do_texture_mapping or do_render
    obj_dst = save_folder + '/mesh_with_colors.obj'
    glb_dst = save_folder + '/mesh.glb'
    worker_v23(
        views_pil,
        cond_pil,
        seed=seed,
        save_folder=save_folder,
        target_face_count=target_face_count,
        do_texture_mapping=do_texture_mapping
    )
    return obj_dst, glb_dst

def stage_4_gif(obj_dst, save_folder, do_render_gif=True):
    if not do_render_gif:
        return None
    gif_dst = save_folder + '/output.gif'
    worker_gif(
        save_folder + '/mesh.obj',
        gif_dst_path=gif_dst
    )
    return gif_dst

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown(CONST_HEADER)

    with gr.Row(variant="panel"):
        with gr.Column(scale=2):
            with gr.Tab("Text to 3D"):
                with gr.Column():
                    text = gr.TextArea('一只黑白相间的熊猫在白色背景上居中坐着,呈现出卡通风格和可爱氛围。',
                                       lines=1, max_lines=10, label='Input text')
                    with gr.Row():
                        textgen_seed = gr.Number(value=0, label="T2I seed", precision=0)
                        textgen_step = gr.Number(value=25, label="T2I step", precision=0)
                        textgen_SEED = gr.Number(value=0, label="Gen seed", precision=0)
                        textgen_STEP = gr.Number(value=50, label="Gen step", precision=0)
                        textgen_max_faces = gr.Number(value=90000, label="max number of faces", precision=0)

                    with gr.Row():
                        textgen_do_texture_mapping = gr.Checkbox(label="texture mapping", value=False)
                        textgen_do_render_gif = gr.Checkbox(label="Render gif", value=False)
                        textgen_submit = gr.Button("Generate", variant="primary")

                    gr.Examples(examples=example_ts, inputs=[text], label="Txt examples")

            with gr.Tab("Image to 3D"):
                with gr.Column():
                    input_image = gr.Image(label="Input image", width=256, height=256,
                                           type="pil", image_mode="RGBA", sources="upload")
                    with gr.Row():
                        imggen_SEED = gr.Number(value=0, label="Gen seed", precision=0)
                        imggen_STEP = gr.Number(value=50, label="Gen step", precision=0)
                        imggen_max_faces = gr.Number(value=90000, label="max number of faces", precision=0)

                    with gr.Row():
                        imggen_do_texture_mapping = gr.Checkbox(label="texture mapping", value=False)
                        imggen_do_render_gif = gr.Checkbox(label="Render gif", value=False)
                        imggen_submit = gr.Button("Generate", variant="primary")

                    gr.Examples(examples=example_is, inputs=[input_image], label="Img examples")

        with gr.Column(scale=3):
            with gr.Tab("rembg image"):
                rem_bg_image = gr.Image(label="No background image", width=256, height=256,
                                        type="pil", image_mode="RGBA")

            with gr.Tab("Multi views"):
                result_image = gr.Image(label="Multi views", type="pil")
            with gr.Tab("Obj"):
                result_3dobj = gr.Model3D(label="Output obj")
            with gr.Tab("Glb"):
                result_3dglb = gr.Model3D(label="Output glb")
            with gr.Tab("GIF"):
                result_gif = gr.Image(label="Rendered GIF")

    # States
    none = gr.State(None)
    save_folder = gr.State()
    cond_image = gr.State()
    views_image = gr.State()
    text_image = gr.State()

    # Event handlers
    textgen_submit.click(
        fn=stage_0_t2i,
        inputs=[text, none, textgen_seed, textgen_step],
        outputs=[rem_bg_image, save_folder],
    ).success(
        fn=stage_2_i2v,
        inputs=[rem_bg_image, textgen_SEED, textgen_STEP, save_folder],
        outputs=[views_image, cond_image, result_image],
    ).success(
        fn=stage_3_v23,
        inputs=[views_image, cond_image, textgen_SEED, save_folder, textgen_max_faces,
                textgen_do_texture_mapping, textgen_do_render_gif],
        outputs=[result_3dobj, result_3dglb],
    ).success(
        fn=stage_4_gif,
        inputs=[result_3dglb, save_folder, textgen_do_render_gif],
        outputs=[result_gif],
    )

    imggen_submit.click(
        fn=stage_0_t2i,
        inputs=[none, input_image, textgen_seed, textgen_step],
        outputs=[text_image, save_folder],
    ).success(
        fn=stage_1_xbg,
        inputs=[text_image, save_folder],
        outputs=[rem_bg_image],
    ).success(
        fn=stage_2_i2v,
        inputs=[rem_bg_image, imggen_SEED, imggen_STEP, save_folder],
        outputs=[views_image, cond_image, result_image],
    ).success(
        fn=stage_3_v23,
        inputs=[views_image, cond_image, imggen_SEED, save_folder, imggen_max_faces,
                imggen_do_texture_mapping, imggen_do_render_gif],
        outputs=[result_3dobj, result_3dglb],
    ).success(
        fn=stage_4_gif,
        inputs=[result_3dglb, save_folder, imggen_do_render_gif],
        outputs=[result_gif],
    )

demo.queue(max_size=CONST_MAX_QUEUE)
demo.launch(server_name=CONST_SERVER, server_port=CONST_PORT)
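The Gradio event chains above encode a four-stage pipeline: text-to-image (or direct image input), background removal, multi-view generation, and views-to-mesh reconstruction, with an optional turntable render. As a rough illustration of how those stages compose outside Gradio, here is a minimal headless sketch; it assumes the weights downloaded by app.py, the `infer` package from this commit, and a CUDA device, and is not part of the upload itself.

# Hypothetical headless driver for the same stages that the Gradio events above chain together.
# Assumes the downloaded weights and the `infer` package from this commit, plus a CUDA device.
from PIL import Image
from infer import Removebg, Image2Views, Views2Mesh, GifRenderer

worker_xbg = Removebg()
worker_i2v = Image2Views(device="cuda:0")
worker_v23 = Views2Mesh("./svrm/configs/svrm.yaml", "weights/svrm/svrm.safetensors", device="cuda:0")

rgba = worker_xbg(Image.open("demos/example_000.png"))               # stage 1: background removal
res_img, _pils = worker_i2v(rgba, seed=0, steps=50)                  # stage 2: multi-view generation
views, cond = res_img[0], res_img[1]
worker_v23(views, cond, seed=0, save_folder="./outputs/headless")    # stage 3: views -> mesh
GifRenderer("cuda:0")("./outputs/headless/mesh.obj",
                      gif_dst_path="./outputs/headless/output.gif")  # stage 4: turntable gif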
assets/logo.png
ADDED
assets/overview_3.png
ADDED
assets/radar.png
ADDED
assets/runtime.png
ADDED
assets/teaser.png
ADDED
Git LFS Details
demos/example_000.png
ADDED
demos/example_001.png
ADDED
demos/example_002.png
ADDED
demos/example_003.png
ADDED
Git LFS Details
demos/example_list.txt
ADDED
@@ -0,0 +1,2 @@
a pot of green plants grows in a red flower pot.
a lovely rabbit eating carrots
infer/__init__.py
ADDED
@@ -0,0 +1,28 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

from .utils import seed_everything, timing_decorator, auto_amp_inference
from .rembg import Removebg
from .text_to_image import Text2Image
from .image_to_views import Image2Views, save_gif
from .views_to_mesh import Views2Mesh
from .gif_render import GifRenderer
infer/gif_render.py
ADDED
@@ -0,0 +1,55 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

from svrm.ldm.vis_util import render
from .utils import seed_everything, timing_decorator

class GifRenderer():
    '''
    render frame(s) of mesh using pytorch3d
    '''
    def __init__(self, device="cuda:0"):
        self.device = device

    @timing_decorator("gif render")
    def __call__(
        self,
        obj_filename,
        elev=0,
        azim=0,
        resolution=512,
        gif_dst_path='',
        n_views=120,
        fps=30,
        rgb=True
    ):
        render(
            obj_filename,
            elev=elev,
            azim=azim,
            resolution=resolution,
            gif_dst_path=gif_dst_path,
            n_views=n_views,
            fps=fps,
            device=self.device,
            rgb=rgb
        )
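For reference, the renderer can be exercised on its own once a reconstructed mesh exists; a minimal sketch, assuming the class above is importable via the `infer` package and a textured mesh.obj is already on disk:

# Hypothetical standalone render of an existing mesh; 120 frames at 30 fps gives a ~4 s turntable.
from infer import GifRenderer

renderer = GifRenderer(device="cuda:0")
renderer("./outputs/test/mesh.obj", gif_dst_path="./outputs/test/turntable.gif", n_views=120, fps=30)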
infer/image_to_views.py
ADDED
@@ -0,0 +1,81 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import time
import torch
import random
import numpy as np
from PIL import Image
from einops import rearrange
from PIL import Image, ImageSequence

from .utils import seed_everything, timing_decorator, auto_amp_inference
from .utils import get_parameter_number, set_parameter_grad_false
from mvd.hunyuan3d_mvd_std_pipeline import HunYuan3D_MVD_Std_Pipeline
from mvd.hunyuan3d_mvd_lite_pipeline import Hunyuan3d_MVD_Lite_Pipeline


def save_gif(pils, save_path, df=False):
    # save a list of PIL.Image to gif
    spf = 4000 / len(pils)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    pils[0].save(save_path, format="GIF", save_all=True, append_images=pils[1:], duration=spf, loop=0)
    return save_path


class Image2Views():
    def __init__(self, device="cuda:0", use_lite=False):
        self.device = device
        if use_lite:
            self.pipe = Hunyuan3d_MVD_Lite_Pipeline.from_pretrained(
                "./weights/mvd_lite",
                torch_dtype = torch.float16,
                use_safetensors = True,
            )
        else:
            self.pipe = HunYuan3D_MVD_Std_Pipeline.from_pretrained(
                "./weights/mvd_std",
                torch_dtype = torch.float16,
                use_safetensors = True,
            )
        self.pipe = self.pipe.to(device)
        self.order = [0, 1, 2, 3, 4, 5] if use_lite else [0, 2, 4, 5, 3, 1]
        set_parameter_grad_false(self.pipe.unet)
        print('image2views unet model', get_parameter_number(self.pipe.unet))

    @torch.no_grad()
    @timing_decorator("image to views")
    @auto_amp_inference
    def __call__(self, pil_img, seed=0, steps=50, guidance_scale=2.0, guidance_curve=lambda t:2.0):
        seed_everything(seed)
        generator = torch.Generator(device=self.device)
        res_img = self.pipe(pil_img,
                            num_inference_steps=steps,
                            guidance_scale=guidance_scale,
                            guidance_curve=guidance_curve,
                            generator=generator).images
        show_image = rearrange(np.asarray(res_img[0], dtype=np.uint8), '(n h) (m w) c -> (n m) h w c', n=3, m=2)
        pils = [res_img[1]]+[Image.fromarray(show_image[idx]) for idx in self.order]
        torch.cuda.empty_cache()
        return res_img, pils
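The `'(n h) (m w) c -> (n m) h w c'` rearrange above slices the generated 3x2 grid image into six per-view tiles before they are reordered by `self.order`. A small shape-only illustration with a synthetic array (not model output):

import numpy as np
from einops import rearrange

grid = np.zeros((3 * 256, 2 * 256, 3), dtype=np.uint8)       # a 3x2 grid of 256x256 views
tiles = rearrange(grid, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
print(tiles.shape)                                            # (6, 256, 256, 3): one tile per view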
infer/rembg.py
ADDED
@@ -0,0 +1,26 @@
from rembg import remove, new_session
from .utils import timing_decorator

class Removebg():
    def __init__(self, name="u2net"):
        '''
        name: rembg
        '''
        self.session = new_session(name)

    @timing_decorator("remove background")
    def __call__(self, rgb_img, force=False):
        '''
        inputs:
            rgb_img: PIL.Image, with RGB mode expected
            force: bool, input is RGBA mode
        return:
            rgba_img: PIL.Image with RGBA mode
        '''
        if rgb_img.mode == "RGBA":
            if force:
                rgb_img = rgb_img.convert("RGB")
            else:
                return rgb_img
        rgba_img = remove(rgb_img, session=self.session)
        return rgba_img
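A minimal usage sketch for the wrapper above; it assumes the u2net weights that rembg fetches on first use and one of the example images shipped in this commit:

from PIL import Image
from infer import Removebg

remover = Removebg()                                    # u2net rembg session
rgba = remover(Image.open("demos/example_000.png"))     # RGB in, RGBA with transparent background out
rgba.save("example_000_nobg.png")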
infer/text_to_image.py
ADDED
@@ -0,0 +1,80 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import torch
from .utils import seed_everything, timing_decorator, auto_amp_inference
from .utils import get_parameter_number, set_parameter_grad_false
from diffusers import HunyuanDiTPipeline, AutoPipelineForText2Image

class Text2Image():
    def __init__(self, pretrain="weights/hunyuanDiT", device="cuda:0", save_memory=False):
        '''
        save_memory: set True to keep the pipeline on CPU and move it to GPU only per call (for low GPU memory)
        '''
        self.save_memory = save_memory
        self.device = device
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            pretrain,
            torch_dtype = torch.float16,
            enable_pag = True,
            pag_applied_layers = ["blocks.(16|17|18|19)"]
        )
        set_parameter_grad_false(self.pipe.transformer)
        print('text2image transformer model', get_parameter_number(self.pipe.transformer))
        if not save_memory:
            self.pipe = self.pipe.to(device)
        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态,残缺,多余的手指,变异的手," \
                       "画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学,糟糕的比例,多余的肢体,克隆的脸," \
                       "毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿,额外的手臂,额外的腿,融合的手指,手指太多,长脖子"

    @torch.no_grad()
    @timing_decorator('text to image')
    @auto_amp_inference
    def __call__(self, *args, **kwargs):
        if self.save_memory:
            self.pipe = self.pipe.to(self.device)
            torch.cuda.empty_cache()
            res = self.call(*args, **kwargs)
            self.pipe = self.pipe.to("cpu")
        else:
            res = self.call(*args, **kwargs)
        torch.cuda.empty_cache()
        return res

    def call(self, prompt, seed=0, steps=25):
        '''
        inputs:
            prompt: str
            seed: int
            steps: int
        return:
            rgb: PIL.Image
        '''
        prompt = prompt + ",白色背景,3D风格,最佳质量"
        seed_everything(seed)
        generator = torch.Generator(device=self.device)
        if seed is not None: generator = generator.manual_seed(int(seed))
        rgb = self.pipe(prompt=prompt, negative_prompt=self.neg_txt, num_inference_steps=steps,
                        pag_scale=1.3, width=1024, height=1024, generator=generator, return_dict=False)[0][0]
        torch.cuda.empty_cache()
        return rgb
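A minimal usage sketch, assuming the HunyuanDiT weights were fetched into weights/hunyuanDiT as in app.py; note the class internally appends a Chinese suffix ("白色背景,3D风格,最佳质量", i.e. white background, 3D style, best quality) to every prompt:

from infer import Text2Image

t2i = Text2Image(pretrain="weights/hunyuanDiT", device="cuda:0", save_memory=True)
image = t2i("a lovely rabbit eating carrots", seed=0, steps=25)   # 1024x1024 PIL.Image
image.save("rabbit.png")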
infer/utils.py
ADDED
@@ -0,0 +1,77 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import time
import random
import numpy as np
import torch
from torch.cuda.amp import autocast, GradScaler
from functools import wraps

def seed_everything(seed):
    '''
    seed everything
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ["PL_GLOBAL_SEED"] = str(seed)

def timing_decorator(category: str):
    '''
    timing_decorator: record and print the wall-clock time of each call
    '''
    def decorator(func):
        func.call_count = 0
        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            end_time = time.time()
            elapsed_time = end_time - start_time
            func.call_count += 1
            print(f"[HunYuan3D]-[{category}], cost time: {elapsed_time:.4f}s")  # huiwen
            return result
        return wrapper
    return decorator

def auto_amp_inference(func):
    '''
    run the wrapped function under torch.cuda.amp.autocast()
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        with autocast():
            output = func(*args, **kwargs)
        return output
    return wrapper

def get_parameter_number(model):
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {'Total': total_num, 'Trainable': trainable_num}

def set_parameter_grad_false(model):
    for p in model.parameters():
        p.requires_grad = False
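The two decorators above are meant to be stacked, which is how the worker classes in this commit use them; a small self-contained sketch of the composition on a toy function (not part of the upload):

import torch
from infer.utils import timing_decorator, auto_amp_inference

@timing_decorator("toy matmul")      # prints "[HunYuan3D]-[toy matmul], cost time: ..."
@auto_amp_inference                  # runs the body under torch.cuda.amp.autocast()
def toy_matmul(a, b):
    return a @ b

x = torch.randn(256, 256, device="cuda" if torch.cuda.is_available() else "cpu")
y = toy_matmul(x, x)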
infer/views_to_mesh.py
ADDED
@@ -0,0 +1,94 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import time
import torch
import random
import numpy as np
from PIL import Image
from einops import rearrange
from PIL import Image, ImageSequence

from .utils import seed_everything, timing_decorator, auto_amp_inference
from .utils import get_parameter_number, set_parameter_grad_false
from svrm.predictor import MV23DPredictor


class Views2Mesh():
    def __init__(self, mv23d_cfg_path, mv23d_ckt_path, device="cuda:0", use_lite=False):
        '''
        mv23d_cfg_path: config yaml file
        mv23d_ckt_path: path to ckpt
        use_lite: select the view ordering of the lite or std multi-view pipeline
        '''
        self.mv23d_predictor = MV23DPredictor(mv23d_ckt_path, mv23d_cfg_path, device=device)
        self.mv23d_predictor.model.eval()
        self.order = [0, 1, 2, 3, 4, 5] if use_lite else [0, 2, 4, 5, 3, 1]
        set_parameter_grad_false(self.mv23d_predictor.model)
        print('view2mesh model', get_parameter_number(self.mv23d_predictor.model))

    @torch.no_grad()
    @timing_decorator("views to mesh")
    @auto_amp_inference
    def __call__(
        self,
        views_pil=None,
        cond_pil=None,
        gif_pil=None,
        seed=0,
        target_face_count = 10000,
        do_texture_mapping = True,
        save_folder='./outputs/test'
    ):
        '''
        set views_pil and cond_pil together, or set gif_pil only
        seed: int
        target_face_count: int
        save_folder: path to save mesh files
        '''
        save_dir = save_folder
        os.makedirs(save_dir, exist_ok=True)

        if views_pil is not None and cond_pil is not None:
            show_image = rearrange(np.asarray(views_pil, dtype=np.uint8),
                                   '(n h) (m w) c -> (n m) h w c', n=3, m=2)
            views = [Image.fromarray(show_image[idx]) for idx in self.order]
            image_list = [cond_pil]+ views
            image_list = [img.convert('RGB') for img in image_list]
        elif gif_pil is not None:
            image_list = [img.convert('RGB') for img in ImageSequence.Iterator(gif_pil)]

        image_input = image_list[0]
        image_list = image_list[1:] + image_list[:1]

        seed_everything(seed)
        self.mv23d_predictor.predict(
            image_list,
            save_dir = save_dir,
            image_input = image_input,
            target_face_count = target_face_count,
            do_texture_mapping = do_texture_mapping
        )
        torch.cuda.empty_cache()
        return save_dir
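A minimal usage sketch for the reconstruction wrapper above, assuming the SVRM config and checkpoint paths used by app.py; the GIF route is shown because it needs no separate conditioning image:

from PIL import Image
from infer import Views2Mesh

v23 = Views2Mesh("./svrm/configs/svrm.yaml", "weights/svrm/svrm.safetensors", device="cuda:0")
out_dir = v23(gif_pil=Image.open("./outputs/test/views.gif"),
              seed=0, target_face_count=30000, save_folder="./outputs/test")
# mesh files (obj/glb) are written under save_folder by the predictor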
mvd/__init__.py
ADDED
(empty file)
mvd/hunyuan3d_mvd_lite_pipeline.py
ADDED
@@ -0,0 +1,493 @@
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import math
import numpy
import torch
import inspect
import warnings
from PIL import Image
from einops import rearrange
import torch.nn.functional as F
from diffusers.utils.torch_utils import randn_tensor
from diffusers.configuration_utils import FrozenDict
from diffusers.image_processor import VaeImageProcessor
from typing import Any, Callable, Dict, List, Optional, Union
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers import DDPMScheduler, EulerAncestralDiscreteScheduler, ImagePipelineOutput
from diffusers.loaders import (
    FromSingleFileMixin,
    LoraLoaderMixin,
    TextualInversionLoaderMixin
)
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection
)
from diffusers.models.attention_processor import (
    Attention,
    AttnProcessor,
    XFormersAttnProcessor,
    AttnProcessor2_0
)

from .utils import to_rgb_image, white_out_background, recenter_img


EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from here import Hunyuan3d_MVD_Qing_Pipeline

        >>> pipe = Hunyuan3d_MVD_Qing_Pipeline.from_pretrained(
        ...     "Tencent-Hunyuan-3D/MVD-Qing", torch_dtype=torch.float16
        ... )
        >>> pipe.to("cuda")

        >>> img = Image.open("demo.png")
        >>> res_img = pipe(img).images[0]
"""

def unscale_latents(latents): return latents / 0.75 + 0.22
def unscale_image (image  ): return image  / 0.50 * 0.80


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg



class ReferenceOnlyAttnProc(torch.nn.Module):
    # reference attention
    def __init__(self, chained_proc, enabled=False, name=None):
        super().__init__()
        self.enabled = enabled
        self.chained_proc = chained_proc
        self.name = name

    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, mode="w", ref_dict=None):
        if encoder_hidden_states is None: encoder_hidden_states = hidden_states
        if self.enabled:
            if mode == 'w':
                ref_dict[self.name] = encoder_hidden_states
            elif mode == 'r':
                encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(self.name)], dim=1)
        res = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask)
        return res


# class RowWiseAttnProcessor2_0:
#     def __call__(self, attn,
#                  hidden_states,
#                  encoder_hidden_states=None,
#                  attention_mask=None,
#                  temb=None,
#                  num_views=6,
#                  *args,
#                  **kwargs):
#         residual = hidden_states
#         if attn.spatial_norm is not None: hidden_states = attn.spatial_norm(hidden_states, temb)

#         input_ndim = hidden_states.ndim
#         if input_ndim == 4:
#             batch_size, channel, height, width = hidden_states.shape
#             hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

#         if encoder_hidden_states is None:
#             batch_size, sequence_length, _ = hidden_states.shape
#         else:
#             batch_size, sequence_length, _ = encoder_hidden_states.shape

#         if attention_mask is not None:
#             attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
#             attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
#         if attn.group_norm is not None: hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

#         query = attn.to_q(hidden_states)
#         if encoder_hidden_states is None: encoder_hidden_states = hidden_states
#         elif attn.norm_cross: encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

#         # encoder_hidden_states [B, 6hw+hw, C] if ref att
#         key = attn.to_k(encoder_hidden_states)    # [B, Vhw+hw, C]
#         value = attn.to_v(encoder_hidden_states)  # [B, Vhw+hw, C]

#         mv_flag = hidden_states.shape[1] < encoder_hidden_states.shape[1] and encoder_hidden_states.shape[1] != 77
#         if mv_flag:
#             target_size = int(math.sqrt(hidden_states.shape[1] // num_views))
#             assert target_size ** 2 * num_views == hidden_states.shape[1]

#             gen_key   = key[:, :num_views*target_size*target_size, :]
#             ref_key   = key[:, num_views*target_size*target_size:, :]
#             gen_value = value[:, :num_views*target_size*target_size, :]
#             ref_value = value[:, num_views*target_size*target_size:, :]

#             # rowwise attention
#             query, gen_key, gen_value = \
#                 rearrange(    query, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
#                           v1=num_views//2, v2=2, h=target_size, w=target_size), \
#                 rearrange(  gen_key, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
#                           v1=num_views//2, v2=2, h=target_size, w=target_size), \
#                 rearrange(gen_value, "b (v1 h v2 w) c -> (b h) (v1 v2 w) c",
#                           v1=num_views//2, v2=2, h=target_size, w=target_size)

#             inner_dim = key.shape[-1]
#             ref_size = int(math.sqrt(ref_key.shape[1]))
#             ref_key_expanded = ref_key.view(batch_size, 1, ref_size * ref_size, inner_dim)
#             ref_key_expanded = ref_key_expanded.expand(-1, target_size, -1, -1).contiguous()
#             ref_key_expanded = ref_key_expanded.view(batch_size * target_size, ref_size * ref_size, inner_dim)
#             key = torch.cat([ gen_key, ref_key_expanded], dim=1)

#             ref_value_expanded = ref_value.view(batch_size, 1, ref_size * ref_size, inner_dim)
#             ref_value_expanded = ref_value_expanded.expand(-1, target_size, -1, -1).contiguous()
#             ref_value_expanded = ref_value_expanded.view(batch_size * target_size, ref_size * ref_size, inner_dim)
#             value = torch.cat([gen_value, ref_value_expanded], dim=1)
#             h = target_size
#         else:
#             target_size = int(math.sqrt(hidden_states.shape[1]))
#             h = 1
#             num_views = 1

#         inner_dim = key.shape[-1]
#         head_dim = inner_dim // attn.heads

#         query = query.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)
#         key   =   key.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)
#         value = value.view(batch_size * h, -1, attn.heads, head_dim).transpose(1, 2)

#         hidden_states = F.scaled_dot_product_attention(query, key, value,
#                                                        attn_mask=attention_mask,
#                                                        dropout_p=0.0,
#                                                        is_causal=False)
#         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size * h,
#                                                               -1,
#                                                               attn.heads * head_dim).to(query.dtype)
#         hidden_states = attn.to_out[1](attn.to_out[0](hidden_states))

#         if mv_flag: hidden_states = rearrange(hidden_states, "(b h) (v1 v2 w) c -> b (v1 h v2 w) c",
#                                               b=batch_size, v1=num_views//2,
#                                               v2=2, h=target_size, w=target_size)

#         if input_ndim == 4:
#             hidden_states = hidden_states.transpose(-1, -2)
#             hidden_states = hidden_states.reshape(batch_size,
#                                                   channel,
#                                                   target_size,
#                                                   target_size)
#         if attn.residual_connection: hidden_states = hidden_states + residual
#         hidden_states = hidden_states / attn.rescale_output_factor
#         return hidden_states


class RefOnlyNoisedUNet(torch.nn.Module):
    def __init__(self, unet, train_sched, val_sched):
        super().__init__()
        self.unet = unet
        self.train_sched = train_sched
        self.val_sched = val_sched

        unet_lora_attn_procs = dict()
        for name, _ in unet.attn_processors.items():
            unet_lora_attn_procs[name] = ReferenceOnlyAttnProc(AttnProcessor2_0(),
                                                               enabled=name.endswith("attn1.processor"),
                                                               name=name)
        unet.set_attn_processor(unet_lora_attn_procs)

    def __getattr__(self, name: str):
        try:
            return super().__getattr__(name)
        except AttributeError:
            return getattr(self.unet, name)

    def forward(self, sample, timestep, encoder_hidden_states, *args, cross_attention_kwargs, **kwargs):
        cond_lat = cross_attention_kwargs['cond_lat']
        noise = torch.randn_like(cond_lat)
        if self.training:
            noisy_cond_lat = self.train_sched.add_noise(cond_lat, noise, timestep)
            noisy_cond_lat = self.train_sched.scale_model_input(noisy_cond_lat, timestep)
        else:
            noisy_cond_lat = self.val_sched.add_noise(cond_lat, noise, timestep.reshape(-1))
            noisy_cond_lat = self.val_sched.scale_model_input(noisy_cond_lat, timestep.reshape(-1))

        ref_dict = {}
        self.unet(noisy_cond_lat,
                  timestep,
                  encoder_hidden_states,
                  *args,
                  cross_attention_kwargs=dict(mode="w", ref_dict=ref_dict),
                  **kwargs)
        return self.unet(sample,
                         timestep,
                         encoder_hidden_states,
                         *args,
                         cross_attention_kwargs=dict(mode="r", ref_dict=ref_dict),
                         **kwargs)


class Hunyuan3d_MVD_Lite_Pipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin):
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        vision_encoder: CLIPVisionModelWithProjection,
        feature_extractor_clip: CLIPImageProcessor,
        feature_extractor_vae: CLIPImageProcessor,
        ramping_coefficients: Optional[list] = None,
        safety_checker=None,
    ):
        DiffusionPipeline.__init__(self)
        self.register_modules(
            vae=vae,
            unet=unet,
            tokenizer=tokenizer,
            scheduler=scheduler,
            text_encoder=text_encoder,
            vision_encoder=vision_encoder,
            feature_extractor_vae=feature_extractor_vae,
            feature_extractor_clip=feature_extractor_clip)
        '''
        rewrite the stable diffusion pipeline
        vae: vae
        unet: unet
        tokenizer: tokenizer
        scheduler: scheduler
        text_encoder: text_encoder
        vision_encoder: vision_encoder
        feature_extractor_vae: feature_extractor_vae
        feature_extractor_clip: feature_extractor_clip
        '''
        self.register_to_config(ramping_coefficients=ramping_coefficients)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def prepare_extra_step_kwargs(self, generator, eta):
        extra_step_kwargs = {}
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_eta: extra_step_kwargs["eta"] = eta

        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator: extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @torch.no_grad()
    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
            self._lora_scale = lora_scale

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
|
341 |
+
)
|
342 |
+
text_input_ids = text_inputs.input_ids
|
343 |
+
|
344 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
345 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
346 |
+
else:
|
347 |
+
attention_mask = None
|
348 |
+
|
349 |
+
prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)[0]
|
350 |
+
|
351 |
+
if self.text_encoder is not None:
|
352 |
+
prompt_embeds_dtype = self.text_encoder.dtype
|
353 |
+
elif self.unet is not None:
|
354 |
+
prompt_embeds_dtype = self.unet.dtype
|
355 |
+
else:
|
356 |
+
prompt_embeds_dtype = prompt_embeds.dtype
|
357 |
+
|
358 |
+
prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
359 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
360 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
361 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
362 |
+
|
363 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
364 |
+
uncond_tokens: List[str]
|
365 |
+
if negative_prompt is None: uncond_tokens = [""] * batch_size
|
366 |
+
elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError()
|
367 |
+
elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt]
|
368 |
+
elif batch_size != len(negative_prompt): raise ValueError()
|
369 |
+
else: uncond_tokens = negative_prompt
|
370 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
371 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
372 |
+
|
373 |
+
max_length = prompt_embeds.shape[1]
|
374 |
+
uncond_input = self.tokenizer(uncond_tokens,
|
375 |
+
padding="max_length",
|
376 |
+
max_length=max_length,
|
377 |
+
truncation=True,
|
378 |
+
return_tensors="pt")
|
379 |
+
|
380 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
381 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
382 |
+
else:
|
383 |
+
attention_mask = None
|
384 |
+
|
385 |
+
negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(device), attention_mask=attention_mask)
|
386 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
387 |
+
|
388 |
+
if do_classifier_free_guidance:
|
389 |
+
seq_len = negative_prompt_embeds.shape[1]
|
390 |
+
negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
391 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
392 |
+
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
393 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
394 |
+
|
395 |
+
return prompt_embeds
|
396 |
+
|
397 |
+
@torch.no_grad()
|
398 |
+
def encode_condition_image(self, image: torch.Tensor): return self.vae.encode(image).latent_dist.sample()
|
399 |
+
|
400 |
+
@torch.no_grad()
|
401 |
+
def __call__(self, image=None,
|
402 |
+
width=640,
|
403 |
+
height=960,
|
404 |
+
num_inference_steps=75,
|
405 |
+
return_dict=True,
|
406 |
+
generator=None,
|
407 |
+
**kwargs):
|
408 |
+
batch_size = 1
|
409 |
+
num_images_per_prompt = 1
|
410 |
+
output_type = 'pil'
|
411 |
+
do_classifier_free_guidance = True
|
412 |
+
guidance_rescale = 0.
|
413 |
+
if isinstance(self.unet, UNet2DConditionModel):
|
414 |
+
self.unet = RefOnlyNoisedUNet(self.unet, None, self.scheduler).eval()
|
415 |
+
|
416 |
+
cond_image = recenter_img(image)
|
417 |
+
cond_image = to_rgb_image(image)
|
418 |
+
image = cond_image
|
419 |
+
image_1 = self.feature_extractor_vae(images=image, return_tensors="pt").pixel_values
|
420 |
+
image_2 = self.feature_extractor_clip(images=image, return_tensors="pt").pixel_values
|
421 |
+
image_1 = image_1.to(device=self.vae.device, dtype=self.vae.dtype)
|
422 |
+
image_2 = image_2.to(device=self.vae.device, dtype=self.vae.dtype)
|
423 |
+
|
424 |
+
cond_lat = self.encode_condition_image(image_1)
|
425 |
+
negative_lat = self.encode_condition_image(torch.zeros_like(image_1))
|
426 |
+
cond_lat = torch.cat([negative_lat, cond_lat])
|
427 |
+
cross_attention_kwargs = dict(cond_lat=cond_lat)
|
428 |
+
|
429 |
+
global_embeds = self.vision_encoder(image_2, output_hidden_states=False).image_embeds.unsqueeze(-2)
|
430 |
+
encoder_hidden_states = self._encode_prompt('', self.device, num_images_per_prompt, False)
|
431 |
+
ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
|
432 |
+
prompt_embeds = torch.cat([encoder_hidden_states, encoder_hidden_states + global_embeds * ramp])
|
433 |
+
|
434 |
+
device = self._execution_device
|
435 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
436 |
+
timesteps = self.scheduler.timesteps
|
437 |
+
num_channels_latents = self.unet.config.in_channels
|
438 |
+
latents = self.prepare_latents(batch_size * num_images_per_prompt,
|
439 |
+
num_channels_latents,
|
440 |
+
height,
|
441 |
+
width,
|
442 |
+
prompt_embeds.dtype,
|
443 |
+
device,
|
444 |
+
generator,
|
445 |
+
None)
|
446 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)
|
447 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
448 |
+
|
449 |
+
# set adaptive cfg
|
450 |
+
# the image order is:
|
451 |
+
# [0, 60,
|
452 |
+
# 120, 180,
|
453 |
+
# 240, 300]
|
454 |
+
# the cfg is set as 3, 2.5, 2, 1.5
|
455 |
+
|
456 |
+
tmp_guidance_scale = torch.ones_like(latents)
|
457 |
+
tmp_guidance_scale[:, :, :40, :40] = 3
|
458 |
+
tmp_guidance_scale[:, :, :40, 40:] = 2.5
|
459 |
+
tmp_guidance_scale[:, :, 40:80, :40] = 2
|
460 |
+
tmp_guidance_scale[:, :, 40:80, 40:] = 1.5
|
461 |
+
tmp_guidance_scale[:, :, 80:120, :40] = 2
|
462 |
+
tmp_guidance_scale[:, :, 80:120, 40:] = 2.5
|
463 |
+
|
464 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
465 |
+
for i, t in enumerate(timesteps):
|
466 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
467 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
468 |
+
|
469 |
+
noise_pred = self.unet(latent_model_input, t,
|
470 |
+
encoder_hidden_states=prompt_embeds,
|
471 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
472 |
+
return_dict=False)[0]
|
473 |
+
|
474 |
+
adaptive_guidance_scale = (2 + 16 * (t / 1000) ** 5) / 3
|
475 |
+
if do_classifier_free_guidance:
|
476 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
477 |
+
noise_pred = noise_pred_uncond + \
|
478 |
+
tmp_guidance_scale * adaptive_guidance_scale * \
|
479 |
+
(noise_pred_text - noise_pred_uncond)
|
480 |
+
|
481 |
+
if do_classifier_free_guidance and guidance_rescale > 0.0:
|
482 |
+
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
|
483 |
+
|
484 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
485 |
+
if i==len(timesteps)-1 or ((i+1)>num_warmup_steps and (i+1)%self.scheduler.order==0):
|
486 |
+
progress_bar.update()
|
487 |
+
|
488 |
+
latents = unscale_latents(latents)
|
489 |
+
image = unscale_image(self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0])
|
490 |
+
image = self.image_processor.postprocess(image, output_type='pil')[0]
|
491 |
+
image = [image, cond_image]
|
492 |
+
return ImagePipelineOutput(images=image) if return_dict else (image,)
|
493 |
+
|
mvd/hunyuan3d_mvd_std_pipeline.py
ADDED
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
|
2 |
+
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
3 |
+
|
4 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
5 |
+
# The below software and/or models in this distribution may have been
|
6 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
7 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
8 |
+
|
9 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
10 |
+
# except for the third-party components listed below.
|
11 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
12 |
+
# in the repsective licenses of these third-party components.
|
13 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
14 |
+
# components and must ensure that the usage of the third party components adheres to
|
15 |
+
# all relevant laws and regulations.
|
16 |
+
|
17 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
18 |
+
# their software and algorithms, including trained model weights, parameters (including
|
19 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
20 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
21 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
22 |
+
|
23 |
+
import inspect
|
24 |
+
from typing import Any, Dict, Optional
|
25 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
26 |
+
|
27 |
+
import os
|
28 |
+
import torch
|
29 |
+
import numpy as np
|
30 |
+
from PIL import Image
|
31 |
+
|
32 |
+
import diffusers
|
33 |
+
from diffusers.image_processor import VaeImageProcessor
|
34 |
+
from diffusers.utils.import_utils import is_xformers_available
|
35 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
36 |
+
from diffusers.utils.torch_utils import randn_tensor
|
37 |
+
from diffusers.utils.import_utils import is_xformers_available
|
38 |
+
from diffusers.models.attention_processor import (
|
39 |
+
Attention,
|
40 |
+
AttnProcessor,
|
41 |
+
XFormersAttnProcessor,
|
42 |
+
AttnProcessor2_0
|
43 |
+
)
|
44 |
+
from diffusers import (
|
45 |
+
AutoencoderKL,
|
46 |
+
DDPMScheduler,
|
47 |
+
DiffusionPipeline,
|
48 |
+
EulerAncestralDiscreteScheduler,
|
49 |
+
UNet2DConditionModel,
|
50 |
+
ImagePipelineOutput
|
51 |
+
)
|
52 |
+
import transformers
|
53 |
+
from transformers import (
|
54 |
+
CLIPImageProcessor,
|
55 |
+
CLIPTextModel,
|
56 |
+
CLIPTokenizer,
|
57 |
+
CLIPVisionModelWithProjection,
|
58 |
+
CLIPTextModelWithProjection
|
59 |
+
)
|
60 |
+
|
61 |
+
from .utils import to_rgb_image, white_out_background, recenter_img
|
62 |
+
|
63 |
+
EXAMPLE_DOC_STRING = """
|
64 |
+
Examples:
|
65 |
+
```py
|
66 |
+
>>> import torch
|
67 |
+
>>> from diffusers import Hunyuan3d_MVD_XL_Pipeline
|
68 |
+
|
69 |
+
>>> pipe = Hunyuan3d_MVD_XL_Pipeline.from_pretrained(
|
70 |
+
... "Tencent-Hunyuan-3D/MVD-XL", torch_dtype=torch.float16
|
71 |
+
... )
|
72 |
+
>>> pipe.to("cuda")
|
73 |
+
|
74 |
+
>>> img = Image.open("demo.png")
|
75 |
+
>>> res_img = pipe(img).images[0]
|
76 |
+
```
|
77 |
+
"""
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
def scale_latents(latents): return (latents - 0.22) * 0.75
|
82 |
+
def unscale_latents(latents): return (latents / 0.75) + 0.22
|
83 |
+
def scale_image(image): return (image - 0.5) / 0.5
|
84 |
+
def scale_image_2(image): return (image * 0.5) / 0.8
|
85 |
+
def unscale_image(image): return (image * 0.5) + 0.5
|
86 |
+
def unscale_image_2(image): return (image * 0.8) / 0.5
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
class ReferenceOnlyAttnProc(torch.nn.Module):
|
92 |
+
def __init__(self, chained_proc, enabled=False, name=None):
|
93 |
+
super().__init__()
|
94 |
+
self.enabled = enabled
|
95 |
+
self.chained_proc = chained_proc
|
96 |
+
self.name = name
|
97 |
+
|
98 |
+
def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, mode="w", ref_dict=None):
|
99 |
+
encoder_hidden_states = hidden_states if encoder_hidden_states is None else encoder_hidden_states
|
100 |
+
if self.enabled:
|
101 |
+
if mode == 'w': ref_dict[self.name] = encoder_hidden_states
|
102 |
+
elif mode == 'r': encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(self.name)], dim=1)
|
103 |
+
else: raise Exception(f"mode should not be {mode}")
|
104 |
+
return self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask)
|
105 |
+
|
106 |
+
|
107 |
+
class RefOnlyNoisedUNet(torch.nn.Module):
|
108 |
+
def __init__(self, unet, scheduler) -> None:
|
109 |
+
super().__init__()
|
110 |
+
self.unet = unet
|
111 |
+
self.scheduler = scheduler
|
112 |
+
|
113 |
+
unet_attn_procs = dict()
|
114 |
+
for name, _ in unet.attn_processors.items():
|
115 |
+
if torch.__version__ >= '2.0': default_attn_proc = AttnProcessor2_0()
|
116 |
+
elif is_xformers_available(): default_attn_proc = XFormersAttnProcessor()
|
117 |
+
else: default_attn_proc = AttnProcessor()
|
118 |
+
unet_attn_procs[name] = ReferenceOnlyAttnProc(
|
119 |
+
default_attn_proc, enabled=name.endswith("attn1.processor"), name=name
|
120 |
+
)
|
121 |
+
unet.set_attn_processor(unet_attn_procs)
|
122 |
+
|
123 |
+
def __getattr__(self, name: str):
|
124 |
+
try:
|
125 |
+
return super().__getattr__(name)
|
126 |
+
except AttributeError:
|
127 |
+
return getattr(self.unet, name)
|
128 |
+
|
129 |
+
def forward(
|
130 |
+
self,
|
131 |
+
sample: torch.FloatTensor,
|
132 |
+
timestep: Union[torch.Tensor, float, int],
|
133 |
+
encoder_hidden_states: torch.Tensor,
|
134 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
135 |
+
class_labels: Optional[torch.Tensor] = None,
|
136 |
+
down_block_res_samples: Optional[Tuple[torch.Tensor]] = None,
|
137 |
+
mid_block_res_sample: Optional[Tuple[torch.Tensor]] = None,
|
138 |
+
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
139 |
+
return_dict: bool = True,
|
140 |
+
**kwargs
|
141 |
+
):
|
142 |
+
|
143 |
+
dtype = self.unet.dtype
|
144 |
+
|
145 |
+
# cond_lat add same level noise
|
146 |
+
cond_lat = cross_attention_kwargs['cond_lat']
|
147 |
+
noise = torch.randn_like(cond_lat)
|
148 |
+
|
149 |
+
noisy_cond_lat = self.scheduler.add_noise(cond_lat, noise, timestep.reshape(-1))
|
150 |
+
noisy_cond_lat = self.scheduler.scale_model_input(noisy_cond_lat, timestep.reshape(-1))
|
151 |
+
|
152 |
+
ref_dict = {}
|
153 |
+
|
154 |
+
_ = self.unet(
|
155 |
+
noisy_cond_lat,
|
156 |
+
timestep,
|
157 |
+
encoder_hidden_states = encoder_hidden_states,
|
158 |
+
class_labels = class_labels,
|
159 |
+
cross_attention_kwargs = dict(mode="w", ref_dict=ref_dict),
|
160 |
+
added_cond_kwargs = added_cond_kwargs,
|
161 |
+
return_dict = return_dict,
|
162 |
+
**kwargs
|
163 |
+
)
|
164 |
+
|
165 |
+
res = self.unet(
|
166 |
+
sample,
|
167 |
+
timestep,
|
168 |
+
encoder_hidden_states,
|
169 |
+
class_labels=class_labels,
|
170 |
+
cross_attention_kwargs = dict(mode="r", ref_dict=ref_dict),
|
171 |
+
down_block_additional_residuals = [
|
172 |
+
sample.to(dtype=dtype) for sample in down_block_res_samples
|
173 |
+
] if down_block_res_samples is not None else None,
|
174 |
+
mid_block_additional_residual = (
|
175 |
+
mid_block_res_sample.to(dtype=dtype)
|
176 |
+
if mid_block_res_sample is not None else None),
|
177 |
+
added_cond_kwargs = added_cond_kwargs,
|
178 |
+
return_dict = return_dict,
|
179 |
+
**kwargs
|
180 |
+
)
|
181 |
+
return res
|
182 |
+
|
183 |
+
|
184 |
+
|
185 |
+
class HunYuan3D_MVD_Std_Pipeline(diffusers.DiffusionPipeline):
|
186 |
+
def __init__(
|
187 |
+
self,
|
188 |
+
vae: AutoencoderKL,
|
189 |
+
unet: UNet2DConditionModel,
|
190 |
+
scheduler: KarrasDiffusionSchedulers,
|
191 |
+
feature_extractor_vae: CLIPImageProcessor,
|
192 |
+
vision_processor: CLIPImageProcessor,
|
193 |
+
vision_encoder: CLIPVisionModelWithProjection,
|
194 |
+
vision_encoder_2: CLIPVisionModelWithProjection,
|
195 |
+
ramping_coefficients: Optional[list] = None,
|
196 |
+
add_watermarker: Optional[bool] = None,
|
197 |
+
safety_checker = None,
|
198 |
+
):
|
199 |
+
DiffusionPipeline.__init__(self)
|
200 |
+
|
201 |
+
self.register_modules(
|
202 |
+
vae=vae, unet=unet, scheduler=scheduler, safety_checker=None, feature_extractor_vae=feature_extractor_vae,
|
203 |
+
vision_processor=vision_processor, vision_encoder=vision_encoder, vision_encoder_2=vision_encoder_2,
|
204 |
+
)
|
205 |
+
self.register_to_config( ramping_coefficients = ramping_coefficients)
|
206 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
207 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
208 |
+
self.default_sample_size = self.unet.config.sample_size
|
209 |
+
self.watermark = None
|
210 |
+
self.prepare_init = False
|
211 |
+
|
212 |
+
def prepare(self):
|
213 |
+
assert isinstance(self.unet, UNet2DConditionModel), "unet should be UNet2DConditionModel"
|
214 |
+
self.unet = RefOnlyNoisedUNet(self.unet, self.scheduler).eval()
|
215 |
+
self.prepare_init = True
|
216 |
+
|
217 |
+
def encode_image(self, image: torch.Tensor, scale_factor: bool = False):
|
218 |
+
latent = self.vae.encode(image).latent_dist.sample()
|
219 |
+
return (latent * self.vae.config.scaling_factor) if scale_factor else latent
|
220 |
+
|
221 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
|
222 |
+
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
223 |
+
shape = (
|
224 |
+
batch_size,
|
225 |
+
num_channels_latents,
|
226 |
+
int(height) // self.vae_scale_factor,
|
227 |
+
int(width) // self.vae_scale_factor,
|
228 |
+
)
|
229 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
230 |
+
raise ValueError(
|
231 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
232 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
233 |
+
)
|
234 |
+
|
235 |
+
if latents is None:
|
236 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
237 |
+
else:
|
238 |
+
latents = latents.to(device)
|
239 |
+
|
240 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
241 |
+
latents = latents * self.scheduler.init_noise_sigma
|
242 |
+
return latents
|
243 |
+
|
244 |
+
def _get_add_time_ids(
|
245 |
+
self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
|
246 |
+
):
|
247 |
+
add_time_ids = list(original_size + crops_coords_top_left + target_size)
|
248 |
+
|
249 |
+
passed_add_embed_dim = (
|
250 |
+
self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
|
251 |
+
)
|
252 |
+
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
|
253 |
+
|
254 |
+
if expected_add_embed_dim != passed_add_embed_dim:
|
255 |
+
raise ValueError(
|
256 |
+
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, " \
|
257 |
+
f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config." \
|
258 |
+
f" Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
|
259 |
+
)
|
260 |
+
|
261 |
+
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
|
262 |
+
return add_time_ids
|
263 |
+
|
264 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
265 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
266 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
267 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
268 |
+
# and should be between [0, 1]
|
269 |
+
|
270 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
271 |
+
extra_step_kwargs = {}
|
272 |
+
if accepts_eta: extra_step_kwargs["eta"] = eta
|
273 |
+
|
274 |
+
# check if the scheduler accepts generator
|
275 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
276 |
+
if accepts_generator: extra_step_kwargs["generator"] = generator
|
277 |
+
return extra_step_kwargs
|
278 |
+
|
279 |
+
@property
|
280 |
+
def guidance_scale(self):
|
281 |
+
return self._guidance_scale
|
282 |
+
|
283 |
+
@property
|
284 |
+
def interrupt(self):
|
285 |
+
return self._interrupt
|
286 |
+
|
287 |
+
@property
|
288 |
+
def do_classifier_free_guidance(self):
|
289 |
+
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
|
290 |
+
|
291 |
+
@torch.no_grad()
|
292 |
+
def __call__(
|
293 |
+
self,
|
294 |
+
image: Image.Image = None,
|
295 |
+
guidance_scale = 2.0,
|
296 |
+
output_type: Optional[str] = "pil",
|
297 |
+
num_inference_steps: int = 50,
|
298 |
+
return_dict: bool = True,
|
299 |
+
eta: float = 0.0,
|
300 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
301 |
+
crops_coords_top_left: Tuple[int, int] = (0, 0),
|
302 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
303 |
+
latent: torch.Tensor = None,
|
304 |
+
guidance_curve = None,
|
305 |
+
**kwargs
|
306 |
+
):
|
307 |
+
if not self.prepare_init:
|
308 |
+
self.prepare()
|
309 |
+
|
310 |
+
here = dict(device=self.vae.device, dtype=self.vae.dtype)
|
311 |
+
|
312 |
+
batch_size = 1
|
313 |
+
num_images_per_prompt = 1
|
314 |
+
width, height = 512 * 2, 512 * 3
|
315 |
+
target_size = original_size = (height, width)
|
316 |
+
|
317 |
+
self._guidance_scale = guidance_scale
|
318 |
+
self._cross_attention_kwargs = cross_attention_kwargs
|
319 |
+
self._interrupt = False
|
320 |
+
|
321 |
+
device = self._execution_device
|
322 |
+
|
323 |
+
# Prepare timesteps
|
324 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
325 |
+
timesteps = self.scheduler.timesteps
|
326 |
+
|
327 |
+
# Prepare latent variables
|
328 |
+
num_channels_latents = self.unet.config.in_channels
|
329 |
+
latents = self.prepare_latents(
|
330 |
+
batch_size * num_images_per_prompt,
|
331 |
+
num_channels_latents,
|
332 |
+
height,
|
333 |
+
width,
|
334 |
+
self.vae.dtype,
|
335 |
+
device,
|
336 |
+
generator,
|
337 |
+
latents=latent,
|
338 |
+
)
|
339 |
+
|
340 |
+
# Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
341 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
342 |
+
|
343 |
+
|
344 |
+
# Prepare added time ids & embeddings
|
345 |
+
text_encoder_projection_dim = 1280
|
346 |
+
add_time_ids = self._get_add_time_ids(
|
347 |
+
original_size,
|
348 |
+
crops_coords_top_left,
|
349 |
+
target_size,
|
350 |
+
dtype=self.vae.dtype,
|
351 |
+
text_encoder_projection_dim=text_encoder_projection_dim,
|
352 |
+
)
|
353 |
+
negative_add_time_ids = add_time_ids
|
354 |
+
|
355 |
+
# hw: preprocess
|
356 |
+
cond_image = recenter_img(image)
|
357 |
+
cond_image = to_rgb_image(image)
|
358 |
+
image_vae = self.feature_extractor_vae(images=cond_image, return_tensors="pt").pixel_values.to(**here)
|
359 |
+
image_clip = self.vision_processor(images=cond_image, return_tensors="pt").pixel_values.to(**here)
|
360 |
+
|
361 |
+
# hw: get cond_lat from cond_img using vae
|
362 |
+
cond_lat = self.encode_image(image_vae, scale_factor=False)
|
363 |
+
negative_lat = self.encode_image(torch.zeros_like(image_vae), scale_factor=False)
|
364 |
+
cond_lat = torch.cat([negative_lat, cond_lat])
|
365 |
+
|
366 |
+
# hw: get visual global embedding using clip
|
367 |
+
global_embeds_1 = self.vision_encoder(image_clip, output_hidden_states=False).image_embeds.unsqueeze(-2)
|
368 |
+
global_embeds_2 = self.vision_encoder_2(image_clip, output_hidden_states=False).image_embeds.unsqueeze(-2)
|
369 |
+
global_embeds = torch.concat([global_embeds_1, global_embeds_2], dim=-1)
|
370 |
+
|
371 |
+
ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
|
372 |
+
prompt_embeds = self.uc_text_emb.to(**here)
|
373 |
+
pooled_prompt_embeds = self.uc_text_emb_2.to(**here)
|
374 |
+
|
375 |
+
prompt_embeds = prompt_embeds + global_embeds * ramp
|
376 |
+
add_text_embeds = pooled_prompt_embeds
|
377 |
+
|
378 |
+
if self.do_classifier_free_guidance:
|
379 |
+
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
|
380 |
+
negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
|
381 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
|
382 |
+
add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
|
383 |
+
add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
|
384 |
+
|
385 |
+
prompt_embeds = prompt_embeds.to(device)
|
386 |
+
add_text_embeds = add_text_embeds.to(device)
|
387 |
+
add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
|
388 |
+
|
389 |
+
# Denoising loop
|
390 |
+
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
391 |
+
timestep_cond = None
|
392 |
+
self._num_timesteps = len(timesteps)
|
393 |
+
|
394 |
+
if guidance_curve is None:
|
395 |
+
guidance_curve = lambda t: guidance_scale
|
396 |
+
|
397 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
398 |
+
for i, t in enumerate(timesteps):
|
399 |
+
if self.interrupt:
|
400 |
+
continue
|
401 |
+
|
402 |
+
# expand the latents if we are doing classifier free guidance
|
403 |
+
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
404 |
+
|
405 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
406 |
+
|
407 |
+
# predict the noise residual
|
408 |
+
added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
|
409 |
+
|
410 |
+
noise_pred = self.unet(
|
411 |
+
latent_model_input,
|
412 |
+
t,
|
413 |
+
encoder_hidden_states=prompt_embeds,
|
414 |
+
timestep_cond=timestep_cond,
|
415 |
+
cross_attention_kwargs=dict(cond_lat=cond_lat),
|
416 |
+
added_cond_kwargs=added_cond_kwargs,
|
417 |
+
return_dict=False,
|
418 |
+
)[0]
|
419 |
+
|
420 |
+
# perform guidance
|
421 |
+
|
422 |
+
# cur_guidance_scale = self.guidance_scale
|
423 |
+
cur_guidance_scale = guidance_curve(t) # 1.5 + 2.5 * ((t/1000)**2)
|
424 |
+
|
425 |
+
if self.do_classifier_free_guidance:
|
426 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
427 |
+
noise_pred = noise_pred_uncond + cur_guidance_scale * (noise_pred_text - noise_pred_uncond)
|
428 |
+
|
429 |
+
# cur_guidance_scale_topleft = (cur_guidance_scale - 1.0) * 4 + 1.0
|
430 |
+
# noise_pred_top_left = noise_pred_uncond +
|
431 |
+
# cur_guidance_scale_topleft * (noise_pred_text - noise_pred_uncond)
|
432 |
+
# _, _, h, w = noise_pred.shape
|
433 |
+
# noise_pred[:, :, :h//3, :w//2] = noise_pred_top_left[:, :, :h//3, :w//2]
|
434 |
+
|
435 |
+
# compute the previous noisy sample x_t -> x_t-1
|
436 |
+
latents_dtype = latents.dtype
|
437 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
|
438 |
+
|
439 |
+
# call the callback, if provided
|
440 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
441 |
+
progress_bar.update()
|
442 |
+
|
443 |
+
latents = unscale_latents(latents)
|
444 |
+
|
445 |
+
if output_type=="latent":
|
446 |
+
image = latents
|
447 |
+
else:
|
448 |
+
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
449 |
+
image = unscale_image(unscale_image_2(image)).clamp(0, 1)
|
450 |
+
image = [
|
451 |
+
Image.fromarray((image[0]*255+0.5).clamp_(0, 255).permute(1, 2, 0).cpu().numpy().astype("uint8")),
|
452 |
+
# self.image_processor.postprocess(image, output_type=output_type)[0],
|
453 |
+
cond_image.resize((512, 512))
|
454 |
+
]
|
455 |
+
|
456 |
+
if not return_dict: return (image,)
|
457 |
+
return ImagePipelineOutput(images=image)
|
458 |
+
|
459 |
+
def save_pretrained(self, save_directory):
|
460 |
+
# uc_text_emb.pt and uc_text_emb_2.pt are inferenced and saved in advance
|
461 |
+
super().save_pretrained(save_directory)
|
462 |
+
torch.save(self.uc_text_emb, os.path.join(save_directory, "uc_text_emb.pt"))
|
463 |
+
torch.save(self.uc_text_emb_2, os.path.join(save_directory, "uc_text_emb_2.pt"))
|
464 |
+
|
465 |
+
@classmethod
|
466 |
+
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
467 |
+
# uc_text_emb.pt and uc_text_emb_2.pt are inferenced and saved in advance
|
468 |
+
pipeline = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
469 |
+
pipeline.uc_text_emb = torch.load(os.path.join(pretrained_model_name_or_path, "uc_text_emb.pt"))
|
470 |
+
pipeline.uc_text_emb_2 = torch.load(os.path.join(pretrained_model_name_or_path, "uc_text_emb_2.pt"))
|
471 |
+
return pipeline
|
mvd/utils.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
|
2 |
+
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
3 |
+
|
4 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
5 |
+
# The below software and/or models in this distribution may have been
|
6 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
7 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
8 |
+
|
9 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
10 |
+
# except for the third-party components listed below.
|
11 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
12 |
+
# in the repsective licenses of these third-party components.
|
13 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
14 |
+
# components and must ensure that the usage of the third party components adheres to
|
15 |
+
# all relevant laws and regulations.
|
16 |
+
|
17 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
18 |
+
# their software and algorithms, including trained model weights, parameters (including
|
19 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
20 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
21 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
from PIL import Image
|
25 |
+
|
26 |
+
def to_rgb_image(maybe_rgba: Image.Image):
|
27 |
+
'''
|
28 |
+
convert a PIL.Image to rgb mode with white background
|
29 |
+
maybe_rgba: PIL.Image
|
30 |
+
return: PIL.Image
|
31 |
+
'''
|
32 |
+
if maybe_rgba.mode == 'RGB':
|
33 |
+
return maybe_rgba
|
34 |
+
elif maybe_rgba.mode == 'RGBA':
|
35 |
+
rgba = maybe_rgba
|
36 |
+
img = np.random.randint(255, 256, size=[rgba.size[1], rgba.size[0], 3], dtype=np.uint8)
|
37 |
+
img = Image.fromarray(img, 'RGB')
|
38 |
+
img.paste(rgba, mask=rgba.getchannel('A'))
|
39 |
+
return img
|
40 |
+
else:
|
41 |
+
raise ValueError("Unsupported image type.", maybe_rgba.mode)
|
42 |
+
|
43 |
+
def white_out_background(pil_img, is_gray_fg=True):
|
44 |
+
data = pil_img.getdata()
|
45 |
+
new_data = []
|
46 |
+
# convert fore-ground white to gray
|
47 |
+
for r, g, b, a in data:
|
48 |
+
if a < 16:
|
49 |
+
new_data.append((255, 255, 255, 0)) # back-ground to be black
|
50 |
+
else:
|
51 |
+
is_white = is_gray_fg and (r>235) and (g>235) and (b>235)
|
52 |
+
new_r = 235 if is_white else r
|
53 |
+
new_g = 235 if is_white else g
|
54 |
+
new_b = 235 if is_white else b
|
55 |
+
new_data.append((new_r, new_g, new_b, a))
|
56 |
+
pil_img.putdata(new_data)
|
57 |
+
return pil_img
|
58 |
+
|
59 |
+
def recenter_img(img, size=512, color=(255,255,255)):
|
60 |
+
img = white_out_background(img)
|
61 |
+
mask = np.array(img)[..., 3]
|
62 |
+
image = np.array(img)[..., :3]
|
63 |
+
|
64 |
+
H, W, C = image.shape
|
65 |
+
coords = np.nonzero(mask)
|
66 |
+
x_min, x_max = coords[0].min(), coords[0].max()
|
67 |
+
y_min, y_max = coords[1].min(), coords[1].max()
|
68 |
+
h = x_max - x_min
|
69 |
+
w = y_max - y_min
|
70 |
+
if h == 0 or w == 0: raise ValueError
|
71 |
+
roi = image[x_min:x_max, y_min:y_max]
|
72 |
+
|
73 |
+
border_ratio = 0.15 # 0.2
|
74 |
+
pad_h = int(h * border_ratio)
|
75 |
+
pad_w = int(w * border_ratio)
|
76 |
+
|
77 |
+
result_tmp = np.full((h + pad_h, w + pad_w, C), color, dtype=np.uint8)
|
78 |
+
result_tmp[pad_h // 2: pad_h // 2 + h, pad_w // 2: pad_w // 2 + w] = roi
|
79 |
+
|
80 |
+
cur_h, cur_w = result_tmp.shape[:2]
|
81 |
+
side = max(cur_h, cur_w)
|
82 |
+
result = np.full((side, side, C), color, dtype=np.uint8)
|
83 |
+
result[(side-cur_h)//2:(side-cur_h)//2+cur_h, (side-cur_w)//2:(side - cur_w)//2+cur_w,:] = result_tmp
|
84 |
+
result = Image.fromarray(result)
|
85 |
+
return result.resize((size, size), Image.LANCZOS) if size else result
|
requirements.txt
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
--find-links https://download.pytorch.org/whl/cu118
|
2 |
+
torch==2.2.0
|
3 |
+
torchvision==0.17.0
|
4 |
+
diffusers
|
5 |
+
transformers
|
6 |
+
rembg
|
7 |
+
tqdm
|
8 |
+
omegaconf
|
9 |
+
matplotlib
|
10 |
+
opencv-python
|
11 |
+
imageio
|
12 |
+
jaxtyping
|
13 |
+
einops
|
14 |
+
SentencePiece
|
15 |
+
accelerate
|
16 |
+
trimesh
|
17 |
+
PyMCubes
|
18 |
+
xatlas
|
19 |
+
libigl
|
20 |
+
git+https://github.com/facebookresearch/pytorch3d
|
21 |
+
git+https://github.com/NVlabs/nvdiffrast
|
22 |
+
open3d
|
scripts/image_to_3d.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# image to 3d
|
2 |
+
|
3 |
+
python main.py \
|
4 |
+
--image_prompt ./demos/example_000.png \
|
5 |
+
--save_folder ./outputs/test/ \
|
6 |
+
--max_faces_num 90000 \
|
7 |
+
--do_texture \
|
8 |
+
--do_render
|
scripts/image_to_3d_demo.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# image to 3d
|
2 |
+
|
3 |
+
python main.py \
|
4 |
+
--image_prompt ./demos/example_000.png \
|
5 |
+
--save_folder ./outputs/test/ \
|
6 |
+
--max_faces_num 90000 \
|
7 |
+
--do_texture_mapping \
|
8 |
+
--do_render
|
scripts/image_to_3d_fast.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# image to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--image_prompt ./demos/example_000.png \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 10000 \
|
6 |
+
--use_lite
|
scripts/image_to_3d_fast_demo.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# image to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--image_prompt ./demos/example_000.png \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 10000 \
|
6 |
+
--use_lite
|
scripts/text_to_3d.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--text_prompt "a lovely cat" \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 90000 \
|
6 |
+
--do_texture \
|
7 |
+
--do_render
|
scripts/text_to_3d_demo.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--text_prompt "a lovely rabbit" \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 90000 \
|
6 |
+
--do_texture_mapping \
|
7 |
+
--do_render
|
scripts/text_to_3d_fast.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--text_prompt "一个广式茶杯" \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 10000 \
|
6 |
+
--use_lite
|
scripts/text_to_3d_fast_demo.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# text to 3d fast
|
2 |
+
python main.py \
|
3 |
+
--text_prompt "一个广式茶杯" \
|
4 |
+
--save_folder ./outputs/test/ \
|
5 |
+
--max_faces_num 10000 \
|
6 |
+
--use_lite
|
svrm/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
svrm/configs/2024-10-24T22-36-18-project.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 3.0e-05
|
3 |
+
target: svrm.ldm.models.svrm.SVRMModel
|
4 |
+
params:
|
5 |
+
|
6 |
+
img_encoder_config:
|
7 |
+
target: svrm.ldm.modules.encoders.dinov2_mod.FrozenDinoV2ImageEmbedder
|
8 |
+
params:
|
9 |
+
version: dinov2_vitb14
|
10 |
+
|
11 |
+
img_to_triplane_config:
|
12 |
+
target: svrm.ldm.modules.translator.img_to_triplane.ImgToTriplaneModel
|
13 |
+
params:
|
14 |
+
pos_emb_size: 64
|
15 |
+
pos_emb_dim: 1024
|
16 |
+
cam_cond_dim: 20
|
17 |
+
n_heads: 16
|
18 |
+
d_head: 64
|
19 |
+
depth: 16
|
20 |
+
context_dim: 768
|
21 |
+
triplane_dim: 120
|
22 |
+
use_fp16: true
|
23 |
+
use_bf16: false
|
24 |
+
upsample_time: 2
|
25 |
+
|
26 |
+
render_config:
|
27 |
+
target: svrm.ldm.modules.rendering_neus.synthesizer.TriplaneSynthesizer
|
28 |
+
params:
|
29 |
+
triplane_dim: 120
|
30 |
+
samples_per_ray: 128
|
31 |
+
|
32 |
+
|
svrm/configs/svrm.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
base_learning_rate: 3.0e-05
|
3 |
+
target: svrm.ldm.models.svrm.SVRMModel
|
4 |
+
params:
|
5 |
+
|
6 |
+
img_encoder_config:
|
7 |
+
target: svrm.ldm.modules.encoders.dinov2_mod.FrozenDinoV2ImageEmbedder
|
8 |
+
params:
|
9 |
+
version: dinov2_vitb14
|
10 |
+
|
11 |
+
img_to_triplane_config:
|
12 |
+
target: svrm.ldm.modules.translator.img_to_triplane.ImgToTriplaneModel
|
13 |
+
params:
|
14 |
+
pos_emb_size: 64
|
15 |
+
pos_emb_dim: 1024
|
16 |
+
cam_cond_dim: 20
|
17 |
+
n_heads: 16
|
18 |
+
d_head: 64
|
19 |
+
depth: 16
|
20 |
+
context_dim: 768
|
21 |
+
triplane_dim: 120
|
22 |
+
use_fp16: true
|
23 |
+
use_bf16: false
|
24 |
+
upsample_time: 2
|
25 |
+
|
26 |
+
render_config:
|
27 |
+
target: svrm.ldm.modules.rendering_neus.synthesizer.TriplaneSynthesizer
|
28 |
+
params:
|
29 |
+
triplane_dim: 120
|
30 |
+
samples_per_ray: 128
|
31 |
+
|
32 |
+
|
svrm/ldm/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
svrm/ldm/models/svrm.py
ADDED
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
|
2 |
+
# The below Model in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
3 |
+
|
4 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
5 |
+
# The below software and/or models in this distribution may have been
|
6 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
7 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
8 |
+
|
9 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
10 |
+
# except for the third-party components listed below.
|
11 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
12 |
+
# in the repsective licenses of these third-party components.
|
13 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
14 |
+
# components and must ensure that the usage of the third party components adheres to
|
15 |
+
# all relevant laws and regulations.
|
16 |
+
|
17 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
18 |
+
# their software and algorithms, including trained model weights, parameters (including
|
19 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
20 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
21 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
22 |
+
|
23 |
+
import os
|
24 |
+
import time
|
25 |
+
import math
|
26 |
+
import cv2
|
27 |
+
import numpy as np
|
28 |
+
import itertools
|
29 |
+
import shutil
|
30 |
+
from tqdm import tqdm
|
31 |
+
import torch
|
32 |
+
import torch.nn.functional as F
|
33 |
+
from einops import rearrange
|
34 |
+
try:
|
35 |
+
import trimesh
|
36 |
+
import mcubes
|
37 |
+
import xatlas
|
38 |
+
import open3d as o3d
|
39 |
+
except:
|
40 |
+
raise "failed to import 3d libraries "
|
41 |
+
|
42 |
+
from ..modules.rendering_neus.mesh import Mesh
|
43 |
+
from ..modules.rendering_neus.rasterize import NVDiffRasterizerContext
|
44 |
+
|
45 |
+
from ..utils.ops import scale_tensor
|
46 |
+
from ..util import count_params, instantiate_from_config
|
47 |
+
from ..vis_util import render
|
48 |
+
|
49 |
+
|
50 |
+
def unwrap_uv(v_pos, t_pos_idx):
|
51 |
+
print("Using xatlas to perform UV unwrapping, may take a while ...")
|
52 |
+
atlas = xatlas.Atlas()
|
53 |
+
atlas.add_mesh(v_pos, t_pos_idx)
|
54 |
+
atlas.generate(xatlas.ChartOptions(), xatlas.PackOptions())
|
55 |
+
_, indices, uvs = atlas.get_mesh(0)
|
56 |
+
indices = indices.astype(np.int64, casting="same_kind")
|
57 |
+
return uvs, indices
|
58 |
+
|
59 |
+
|
60 |
+
def uv_padding(image, hole_mask, uv_padding_size = 2):
|
61 |
+
return cv2.inpaint(
|
62 |
+
(image.detach().cpu().numpy() * 255).astype(np.uint8),
|
63 |
+
(hole_mask.detach().cpu().numpy() * 255).astype(np.uint8),
|
64 |
+
uv_padding_size,
|
65 |
+
cv2.INPAINT_TELEA
|
66 |
+
)
|
67 |
+
|
68 |
+
def refine_mesh(vtx_refine, faces_refine):
|
69 |
+
mesh = o3d.geometry.TriangleMesh(
|
70 |
+
vertices=o3d.utility.Vector3dVector(vtx_refine),
|
71 |
+
triangles=o3d.utility.Vector3iVector(faces_refine))
|
72 |
+
|
73 |
+
mesh = mesh.remove_unreferenced_vertices()
|
74 |
+
mesh = mesh.remove_duplicated_triangles()
|
75 |
+
mesh = mesh.remove_duplicated_vertices()
|
76 |
+
|
77 |
+
voxel_size = max(mesh.get_max_bound() - mesh.get_min_bound())
|
78 |
+
|
79 |
+
mesh = mesh.simplify_vertex_clustering(
|
80 |
+
voxel_size=0.007, # 0.005
|
81 |
+
contraction=o3d.geometry.SimplificationContraction.Average)
|
82 |
+
|
83 |
+
mesh = mesh.filter_smooth_simple(number_of_iterations=2)
|
84 |
+
|
85 |
+
vtx_refine = np.asarray(mesh.vertices).astype(np.float32)
|
86 |
+
faces_refine = np.asarray(mesh.triangles)
|
87 |
+
return vtx_refine, faces_refine, mesh
|
88 |
+
|
89 |
+
|
90 |
+
class SVRMModel(torch.nn.Module):
|
91 |
+
def __init__(
|
92 |
+
self,
|
93 |
+
img_encoder_config,
|
94 |
+
img_to_triplane_config,
|
95 |
+
render_config,
|
96 |
+
device = "cuda:0",
|
97 |
+
**kwargs
|
98 |
+
):
|
99 |
+
super().__init__()
|
100 |
+
|
101 |
+
self.img_encoder = instantiate_from_config(img_encoder_config).half()
|
102 |
+
self.img_to_triplane_decoder = instantiate_from_config(img_to_triplane_config).half()
|
103 |
+
self.render = instantiate_from_config(render_config).half()
|
104 |
+
self.device = device
|
105 |
+
count_params(self, verbose=True)
|
106 |
+
|
107 |
+
@torch.no_grad()
|
108 |
+
def export_mesh_with_uv(
|
109 |
+
self,
|
110 |
+
data,
|
111 |
+
mesh_size: int = 384,
|
112 |
+
ctx = None,
|
113 |
+
context_type = 'cuda',
|
114 |
+
texture_res = 1024,
|
115 |
+
target_face_count = 10000,
|
116 |
+
do_texture_mapping = True,
|
117 |
+
out_dir = 'outputs/test'
|
118 |
+
):
|
119 |
+
"""
|
120 |
+
color_type: 0 for ray texture, 1 for vertices texture
|
121 |
+
"""
|
122 |
+
st = time.time()
|
123 |
+
here = {'device': self.device, 'dtype': torch.float16}
|
124 |
+
input_view_image = data["input_view"].to(**here) # [b, m, c, h, w]
|
125 |
+
input_view_cam = data["input_view_cam"].to(**here) # [b, m, 20]
|
126 |
+
|
127 |
+
batch_size, input_view_num, *_ = input_view_image.shape
|
128 |
+
assert batch_size == 1, "batch size should be 1"
|
129 |
+
|
130 |
+
input_view_image = rearrange(input_view_image, 'b m c h w -> (b m) c h w')
|
131 |
+
input_view_cam = rearrange(input_view_cam, 'b m d -> (b m) d')
|
132 |
+
input_view_feat = self.img_encoder(input_view_image, input_view_cam)
|
133 |
+
input_view_feat = rearrange(input_view_feat, '(b m) l d -> b (l m) d', m=input_view_num)
|
134 |
+
|
135 |
+
# -- decoder
|
136 |
+
torch.cuda.empty_cache()
|
137 |
+
triplane_gen = self.img_to_triplane_decoder(input_view_feat) # [b, 3, tri_dim, h, w]
|
138 |
+
del input_view_feat
|
139 |
+
torch.cuda.empty_cache()
|
140 |
+
|
141 |
+
# --- triplane nerf render
|
142 |
+
|
143 |
+
cur_triplane = triplane_gen[0:1]
|
144 |
+
|
145 |
+
aabb = torch.tensor([[-0.6, -0.6, -0.6], [0.6, 0.6, 0.6]]).unsqueeze(0).to(**here)
|
146 |
+
grid_out = self.render.forward_grid(planes=cur_triplane, grid_size=mesh_size, aabb=aabb)
|
147 |
+
|
148 |
+
print(f"=====> LRM forward time: {time.time() - st}")
|
149 |
+
st = time.time()
|
150 |
+
|
151 |
+
vtx, faces = mcubes.marching_cubes(0. - grid_out['sdf'].squeeze(0).squeeze(-1).cpu().float().numpy(), 0)
|
152 |
+
|
153 |
+
bbox = aabb[0].cpu().numpy()
|
154 |
+
vtx = vtx / (mesh_size - 1)
|
155 |
+
vtx = vtx * (bbox[1] - bbox[0]) + bbox[0]
|
156 |
+
|
157 |
+
# refine mesh
|
158 |
+
vtx_refine, faces_refine, mesh = refine_mesh(vtx, faces)
|
159 |
+
|
160 |
+
# reduce faces
|
161 |
+
if faces_refine.shape[0] > target_face_count:
|
162 |
+
print(f"reduce face: {faces_refine.shape[0]} -> {target_face_count}")
|
163 |
+
mesh = o3d.geometry.TriangleMesh(
|
164 |
+
vertices = o3d.utility.Vector3dVector(vtx_refine),
|
165 |
+
triangles = o3d.utility.Vector3iVector(faces_refine)
|
166 |
+
)
|
167 |
+
|
168 |
+
# Function to simplify mesh using Quadric Error Metric Decimation by Garland and Heckbert
|
169 |
+
mesh = mesh.simplify_quadric_decimation(target_face_count, boundary_weight=1.0)
|
170 |
+
|
171 |
+
mesh = Mesh(
|
172 |
+
v_pos = torch.from_numpy(np.asarray(mesh.vertices)).to(self.device),
|
173 |
+
t_pos_idx = torch.from_numpy(np.asarray(mesh.triangles)).to(self.device),
|
174 |
+
v_rgb = torch.from_numpy(np.asarray(mesh.vertex_colors)).to(self.device)
|
175 |
+
)
|
176 |
+
vtx_refine = mesh.v_pos.cpu().numpy()
|
177 |
+
faces_refine = mesh.t_pos_idx.cpu().numpy()
|
178 |
+
|
179 |
+
vtx_colors = self.render.forward_points(cur_triplane, torch.tensor(vtx_refine).unsqueeze(0).to(**here))
|
180 |
+
vtx_colors = vtx_colors['rgb'].float().squeeze(0).cpu().numpy()
|
181 |
+
|
182 |
+
color_ratio = 0.8 # increase brightness
|
183 |
+
with open(f'{out_dir}/mesh_with_colors.obj', 'w') as fid:
|
184 |
+
verts = vtx_refine[:, [1,2,0]]
|
185 |
+
for pidx, pp in enumerate(verts):
|
186 |
+
color = vtx_colors[pidx]
|
187 |
+
color = [color[0]**color_ratio, color[1]**color_ratio, color[2]**color_ratio]
|
188 |
+
fid.write('v %f %f %f %f %f %f\n' % (pp[0], pp[1], pp[2], color[0], color[1], color[2]))
|
189 |
+
for i, f in enumerate(faces_refine):
|
190 |
+
f1 = f + 1
|
191 |
+
fid.write('f %d %d %d\n' % (f1[0], f1[1], f1[2]))
|
192 |
+
|
193 |
+
mesh = trimesh.load_mesh(f'{out_dir}/mesh_with_colors.obj')
|
194 |
+
print(f"=====> generate mesh with vertex shading time: {time.time() - st}")
|
195 |
+
st = time.time()
|
196 |
+
|
197 |
+
if not do_texture_mapping:
|
198 |
+
shutil.copy(f'{out_dir}/mesh_with_colors.obj', f'{out_dir}/mesh.obj')
|
199 |
+
mesh.export(f'{out_dir}/mesh.glb', file_type='glb')
|
200 |
+
return None
|
201 |
+
|
202 |
+
########## export texture ########
|
203 |
+
st = time.time()
|
204 |
+
|
205 |
+
# uv unwrap
|
206 |
+
vtx_tex, t_tex_idx = unwrap_uv(vtx_refine, faces_refine)
|
207 |
+
vtx_refine = torch.from_numpy(vtx_refine).to(self.device)
|
208 |
+
faces_refine = torch.from_numpy(faces_refine).to(self.device)
|
209 |
+
t_tex_idx = torch.from_numpy(t_tex_idx).to(self.device)
|
210 |
+
uv_clip = torch.from_numpy(vtx_tex * 2.0 - 1.0).to(self.device)
|
211 |
+
|
212 |
+
# rasterize
|
213 |
+
ctx = NVDiffRasterizerContext(context_type, cur_triplane.device) if ctx is None else ctx
|
214 |
+
rast = ctx.rasterize_one(
|
215 |
+
torch.cat([
|
216 |
+
uv_clip,
|
217 |
+
torch.zeros_like(uv_clip[..., 0:1]),
|
218 |
+
torch.ones_like(uv_clip[..., 0:1])
|
219 |
+
], dim=-1),
|
220 |
+
t_tex_idx,
|
221 |
+
(texture_res, texture_res)
|
222 |
+
)[0]
|
223 |
+
hole_mask = ~(rast[:, :, 3] > 0)
|
224 |
+
|
225 |
+
# Interpolate world space position
|
226 |
+
gb_pos = ctx.interpolate_one(vtx_refine, rast[None, ...], faces_refine)[0][0]
|
227 |
+
with torch.no_grad():
|
228 |
+
gb_mask_pos_scale = scale_tensor(gb_pos.unsqueeze(0).view(1, -1, 3), (-1, 1), (-1, 1))
|
229 |
+
tex_map = self.render.forward_points(cur_triplane, gb_mask_pos_scale)['rgb']
|
230 |
+
tex_map = tex_map.float().squeeze(0) # (0, 1)
|
231 |
+
tex_map = tex_map.view((texture_res, texture_res, 3))
|
232 |
+
img = uv_padding(tex_map, hole_mask)
|
233 |
+
img = ((img/255.0) ** color_ratio) * 255 # increase brightness
|
234 |
+
img = img.clip(0, 255).astype(np.uint8)
|
235 |
+
|
236 |
+
verts = vtx_refine.cpu().numpy()[:, [1,2,0]]
|
237 |
+
faces = faces_refine.cpu().numpy()
|
238 |
+
|
239 |
+
with open(f'{out_dir}/texture.mtl', 'w') as fid:
|
240 |
+
fid.write('newmtl material_0\n')
|
241 |
+
fid.write("Ka 1.000 1.000 1.000\n")
|
242 |
+
fid.write("Kd 1.000 1.000 1.000\n")
|
243 |
+
fid.write("Ks 0.000 0.000 0.000\n")
|
244 |
+
fid.write("d 1.0\n")
|
245 |
+
fid.write("illum 2\n")
|
246 |
+
fid.write(f'map_Kd texture.png\n')
|
247 |
+
|
248 |
+
with open(f'{out_dir}/mesh.obj', 'w') as fid:
|
249 |
+
fid.write(f'mtllib texture.mtl\n')
|
250 |
+
for pidx, pp in enumerate(verts):
|
251 |
+
fid.write('v %f %f %f\n' % (pp[0], pp[1], pp[2]))
|
252 |
+
for pidx, pp in enumerate(vtx_tex):
|
253 |
+
fid.write('vt %f %f\n' % (pp[0], 1 - pp[1]))
|
254 |
+
fid.write('usemtl material_0\n')
|
255 |
+
for i, f in enumerate(faces):
|
256 |
+
f1 = f + 1
|
257 |
+
f2 = t_tex_idx[i] + 1
|
258 |
+
fid.write('f %d/%d %d/%d %d/%d\n' % (f1[0], f2[0], f1[1], f2[1], f1[2], f2[2],))
|
259 |
+
|
260 |
+
cv2.imwrite(f'{out_dir}/texture.png', img[..., [2, 1, 0]])
|
261 |
+
mesh = trimesh.load_mesh(f'{out_dir}/mesh.obj')
|
262 |
+
mesh.export(f'{out_dir}/mesh.glb', file_type='glb')
|
263 |
+
|
svrm/ldm/modules/attention.py
ADDED
@@ -0,0 +1,457 @@
from inspect import isfunction
from typing import Any, Optional
import math
import torch
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat
import numpy as np

FLASH_IS_AVAILABLE = XFORMERS_IS_AVAILBLE = False
try:
    from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
    FLASH_IS_AVAILABLE = True
except:
    try:
        import xformers
        import xformers.ops
        XFORMERS_IS_AVAILBLE = True
    except:
        pass


def exists(val):
    return val is not None


def uniq(arr):
    return {el: True for el in arr}.keys()


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


def max_neg_value(t):
    return -torch.finfo(t.dtype).max


def init_(tensor):
    dim = tensor.shape[-1]
    std = 1 / math.sqrt(dim)
    tensor.uniform_(-std, std)
    return tensor


def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
        return func(*inputs)


class CheckpointFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])

        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads


# feedforward
class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        x, gate = self.proj(x).chunk(2, dim=-1)
        return x * F.gelu(gate)


class FeedForward(nn.Module):
    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
        project_in = nn.Sequential(
            nn.Linear(dim, inner_dim),
            nn.GELU()
        ) if not glu else GEGLU(dim, inner_dim)

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
        return self.net(x)


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def Normalize(in_channels):
    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)


class LinearAttention(nn.Module):
    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads=self.heads, qkv=3)
        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        out = torch.einsum('bhde,bhdn->bhen', context, q)
        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
        return self.to_out(out)


class SpatialSelfAttention(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = rearrange(q, 'b c h w -> b (h w) c')
        k = rearrange(k, 'b c h w -> b c (h w)')
        w_ = torch.einsum('bij,bjk->bik', q, k)

        w_ = w_ * (int(c)**(-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = rearrange(v, 'b c h w -> b c (h w)')
        w_ = rearrange(w_, 'b i j -> b j i')
        h_ = torch.einsum('bij,bjk->bik', v, w_)
        h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
        h_ = self.proj_out(h_)

        return x + h_


class CrossAttention(nn.Module):
    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)
        self.scale = dim_head ** -0.5
        self.heads = heads
        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        h = self.heads
        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
        if exists(mask):
            mask = rearrange(mask, 'b ... -> b (...)')
            max_neg_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, 'b j -> (b h) () j', h=h)
            sim.masked_fill_(~mask, max_neg_value)
        # attention, what we cannot get enough of
        attn = sim.softmax(dim=-1)
        out = einsum('b i j, b j d -> b i d', attn, v)  # [b*h, n, d]
        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
        return self.to_out(out)


class FlashAttention(nn.Module):
    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
              f"{heads} heads.")
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)
        self.scale = dim_head ** -0.5
        self.heads = heads
        self.dropout = dropout
        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        context = default(context, x)
        h = self.heads
        dtype = torch.bfloat16  # torch.half
        q = self.to_q(x).to(dtype)
        k = self.to_k(context).to(dtype)
        v = self.to_v(context).to(dtype)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b n h d', h=h), (q, k, v))  # q is [b, 3079, 16, 64]
        out = flash_attn_func(q, k, v, dropout_p=self.dropout, softmax_scale=None, causal=False, window_size=(-1, -1))  # out has the same shape as q
        out = rearrange(out, 'b n h d -> b n (h d)', h=h)
        return self.to_out(out.float())


class MemoryEfficientCrossAttention(nn.Module):
    # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
              f"{heads} heads.")
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.heads = heads
        self.dim_head = dim_head

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
        self.attention_op: Optional[Any] = None

    def forward(self, x, context=None, mask=None):
        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        b, _, _ = q.shape
        q, k, v = map(
            lambda t: t.unsqueeze(3)
            .reshape(b, t.shape[1], self.heads, self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b * self.heads, t.shape[1], self.dim_head)
            .contiguous(),
            (q, k, v),
        )

        # actually compute the attention, what we cannot get enough of
        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)

        if exists(mask):
            raise NotImplementedError
        out = (
            out.unsqueeze(0)
            .reshape(b, self.heads, out.shape[1], self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b, out.shape[1], self.heads * self.dim_head)
        )
        return self.to_out(out)


class BasicTransformerBlock(nn.Module):
    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
                 disable_self_attn=False):
        super().__init__()
        self.disable_self_attn = disable_self_attn
        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                                    context_dim=context_dim if self.disable_self_attn else None)  # is a self-attention if not self.disable_self_attn
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
                                    heads=n_heads, dim_head=d_head, dropout=dropout)  # is self-attn if context is none
        self.norm1 = Fp32LayerNorm(dim)
        self.norm2 = Fp32LayerNorm(dim)
        self.norm3 = Fp32LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)

    def _forward(self, x, context=None):
        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
        return x


ATTENTION_MODES = {
    "softmax": CrossAttention,  # vanilla attention
    "softmax-xformers": MemoryEfficientCrossAttention,
    "softmax-flash": FlashAttention
}


def modulate(x, shift, scale):
    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)


class Fp32LayerNorm(nn.LayerNorm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


class AdaNorm(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(dim, 2 * dim, bias=True)
        )
        self.norm = Fp32LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, c):  # x is fp32, c is fp16
        shift, scale = self.adaLN_modulation(c.float()).chunk(2, dim=1)  # bf16
        x = modulate(self.norm(x), shift, scale)  # fp32
        return x


class BasicTransformerBlockLRM(nn.Module):
    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True,
                 checkpoint=True):
        super().__init__()

        attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
        attn_mode = "softmax-flash" if FLASH_IS_AVAILABLE else attn_mode
        assert attn_mode in ATTENTION_MODES
        attn_cls = ATTENTION_MODES[attn_mode]

        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                              context_dim=context_dim)  # cross-attn
        self.attn2 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                              context_dim=None)  # self-attn

        self.norm1 = Fp32LayerNorm(dim)
        self.norm2 = Fp32LayerNorm(dim)
        self.norm3 = Fp32LayerNorm(dim)

        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        self.checkpoint = checkpoint

    def forward(self, x, context=None, cam_emb=None):  # (torch.float32, torch.float32, torch.bfloat16)
        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)

    def _forward(self, x, context=None, cam_emb=None):
        x = self.attn1(self.norm1(x), context=context) + x  # cross-attn
        x = self.attn2(self.norm2(x), context=None) + x     # self-attn
        x = self.ff(self.norm3(x)) + x
        return x


class ImgToTriplaneTransformer(nn.Module):
    """
    Transformer block for image-like data.
    First, project the input (aka embedding) and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image.
    """
    def __init__(self, query_dim, n_heads, d_head, depth=1, dropout=0., context_dim=None, triplane_size=64):
        super().__init__()

        self.transformer_blocks = nn.ModuleList([
            BasicTransformerBlockLRM(query_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
            for d in range(depth)])

        self.norm = Fp32LayerNorm(query_dim, eps=1e-6)

        self.initialize_weights()

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.LayerNorm):
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
                if module.weight is not None:
                    nn.init.constant_(module.weight, 1.0)
        self.apply(_basic_init)

    def forward(self, x, context=None, cam_emb=None):
        # note: if no context is given, cross-attention defaults to self-attention
        for block in self.transformer_blocks:
            x = block(x, context=context)
        x = self.norm(x)
        return x
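For orientation, a minimal forward-pass sketch of the two entry points this module exposes, CrossAttention and ImgToTriplaneTransformer. It assumes the repo root is on PYTHONPATH so the svrm packages import, that neither flash-attn nor xFormers is installed (so the vanilla softmax path is selected), and that the tensor sizes are purely illustrative:

import torch
from svrm.ldm.modules.attention import CrossAttention, ImgToTriplaneTransformer

# Query tokens: e.g. flattened triplane tokens; context: e.g. image encoder tokens.
tokens = torch.randn(2, 3 * 32 * 32, 512)   # (batch, num_query_tokens, query_dim)
img_feats = torch.randn(2, 1370, 768)       # (batch, num_context_tokens, context_dim)

attn = CrossAttention(query_dim=512, context_dim=768, heads=8, dim_head=64)
print(attn(tokens, context=img_feats).shape)  # torch.Size([2, 3072, 512])

# One cross-attn + self-attn + feed-forward block per depth, each wrapped in an fp32 LayerNorm.
model = ImgToTriplaneTransformer(query_dim=512, n_heads=8, d_head=64, depth=2, context_dim=768)
print(model(tokens, context=img_feats).shape)  # torch.Size([2, 3072, 512])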
svrm/ldm/modules/encoders/__init__.py
ADDED
File without changes
|
svrm/ldm/modules/encoders/dinov2/__init__.py
ADDED
File without changes
|
svrm/ldm/modules/encoders/dinov2/hub/__init__.py
ADDED
File without changes
|
svrm/ldm/modules/encoders/dinov2/hub/backbones.py
ADDED
@@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Union

import torch

from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name


class Weights(Enum):
    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model


def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
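A sketch of how these hub-style constructors are meant to be used, assuming the vendored vision_transformer module under dinov2/models matches upstream DINOv2 (so forward_features returns the usual token dictionary); with pretrained=False nothing is downloaded:

import torch
from svrm.ldm.modules.encoders.dinov2.hub.backbones import dinov2_vitb14

# Build the architecture only; weights come from https://dl.fbaipublicfiles.com/dinov2
# when pretrained=True.
model = dinov2_vitb14(pretrained=False)
model.eval()

x = torch.randn(1, 3, 518, 518)  # img_size=518, patch_size=14 -> 37x37 patch grid
with torch.no_grad():
    feats = model.forward_features(x)
# Expected (1, 1369, 768) for ViT-B/14 if the vendored ViT matches upstream.
print(feats["x_norm_patchtokens"].shape)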
svrm/ldm/modules/encoders/dinov2/hub/utils.py
ADDED
@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import itertools
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"


def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
    compact_arch_name = arch_name.replace("_", "")[:4]
    registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
    return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"


class CenterPadding(nn.Module):
    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        new_size = math.ceil(size / self.multiple) * self.multiple
        pad_size = new_size - size
        pad_size_left = pad_size // 2
        pad_size_right = pad_size - pad_size_left
        return pad_size_left, pad_size_right

    @torch.inference_mode()
    def forward(self, x):
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        output = F.pad(x, pads)
        return output
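Two quick usage notes on this helper module, shown as a runnable sketch: _make_dinov2_model_name is what backbones.py uses to assemble checkpoint URLs, and CenterPadding pads the spatial dimensions up to the next multiple of the patch size, splitting the padding evenly on both sides. The import path assumes the repo root is on PYTHONPATH:

import torch
from svrm.ldm.modules.encoders.dinov2.hub.utils import CenterPadding, _make_dinov2_model_name

print(_make_dinov2_model_name("vit_large", 14))     # dinov2_vitl14
print(_make_dinov2_model_name("vit_large", 14, 4))  # dinov2_vitl14_reg4

pad = CenterPadding(multiple=14)
x = torch.randn(1, 3, 500, 500)
print(pad(x).shape)  # torch.Size([1, 3, 504, 504]) -- H and W rounded up to a multiple of 14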
svrm/ldm/modules/encoders/dinov2/layers/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from .dino_head import DINOHead
from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlockMod
from .attention import MemEffAttention
svrm/ldm/modules/encoders/dinov2/layers/attention.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging
import os
import warnings

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import memory_efficient_attention, unbind

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Attention)")
    else:
        warnings.warn("xFormers is disabled (Attention)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    warnings.warn("xFormers is not available (Attention)")


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
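A shape-level sketch of the two attention variants above; MemEffAttention falls back to the vanilla Attention.forward whenever xFormers is unavailable and no attn_bias is passed, so the example runs on CPU either way (token counts are illustrative, import path assumes the repo root on PYTHONPATH):

import torch
from svrm.ldm.modules.encoders.dinov2.layers.attention import Attention, MemEffAttention

attn = Attention(dim=768, num_heads=12)
x = torch.randn(2, 1370, 768)  # (batch, tokens, dim), e.g. 1369 patches + 1 cls token
print(attn(x).shape)  # torch.Size([2, 1370, 768])

mea = MemEffAttention(dim=768, num_heads=12)
print(mea(x).shape)   # same shape, xFormers kernel used only when available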
svrm/ldm/modules/encoders/dinov2/layers/block.py
ADDED
@@ -0,0 +1,269 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import os
import logging
import warnings
from typing import Callable, List, Any, Tuple, Dict

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp

from ....attention import AdaNorm


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import fmha, scaled_index_add, index_select_cat

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Block)")
    else:
        warnings.warn("xFormers is disabled (Block)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    warnings.warn("xFormers is not available (Block)")


class BlockMod(nn.Module):
    '''
    using Modified Block, see below
    '''
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = AdaNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor, cam_emb: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor, cam_emb: Tensor = None) -> Tensor:
            return self.ls1(self.attn(self.norm1(x, cam_emb)))

        def ffn_residual_func(x: Tensor, cam_emb: Tensor = None) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x, cam_emb)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x, cam_emb))
            x = x + self.drop_path1(ffn_residual_func(x, cam_emb))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x, cam_emb)
            x = x + ffn_residual_func(x, cam_emb)
        return x


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    # drop_add_residual_stochastic_depth_list

    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    # get_branges_scales
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    # add residuals
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlockMod(BlockMod):
    def forward_nested(self, x_list: List[Tensor], cam_emb_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x, cam_emb), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x, cam_emb))

            x_list = drop_add_residual_stochastic_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x, cam_emb), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, cam_emb: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x, cam_emb)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list, cam_emb_or_cam_emb_list):
        if isinstance(x_or_x_list, Tensor) and isinstance(cam_emb_or_cam_emb_list, Tensor):
            return super().forward(x_or_x_list, cam_emb_or_cam_emb_list)
        elif isinstance(x_or_x_list, list) and isinstance(cam_emb_or_cam_emb_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list, cam_emb_or_cam_emb_list)
        else:
            raise AssertionError
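A minimal construction sketch for BlockMod, assuming the svrm packages are importable and that the sibling mlp.py matches upstream DINOv2; the default norm_layer=AdaNorm means every block expects a per-sample camera embedding alongside the token sequence:

import torch
from svrm.ldm.modules.encoders.dinov2.layers.block import BlockMod

block = BlockMod(dim=768, num_heads=12)  # AdaNorm + vanilla Attention by default
x = torch.randn(2, 1370, 768)            # token sequence
cam_emb = torch.randn(2, 768)            # camera embedding consumed by adaLN modulation
print(block(x, cam_emb).shape)           # torch.Size([2, 1370, 768])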
svrm/ldm/modules/encoders/dinov2/layers/dino_head.py
ADDED
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
from torch.nn.init import trunc_normal_
from torch.nn.utils import weight_norm


class DINOHead(nn.Module):
    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
        self.apply(self._init_weights)
        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.mlp(x)
        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        x = self.last_layer(x)
        return x


def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
    if nlayers == 1:
        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
    else:
        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
        if use_bn:
            layers.append(nn.BatchNorm1d(hidden_dim))
        layers.append(nn.GELU())
        for _ in range(nlayers - 2):
            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
        return nn.Sequential(*layers)
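A small shape check for DINOHead, assuming an importable package path; 65536 is only the prototype count commonly used in DINO-style training, not a value this repo fixes:

import torch
from svrm.ldm.modules.encoders.dinov2.layers.dino_head import DINOHead

head = DINOHead(in_dim=768, out_dim=65536)  # 768-d MLP bottleneck -> 65536 prototypes
x = torch.randn(8, 768)
print(head(x).shape)  # torch.Size([8, 65536])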
svrm/ldm/modules/encoders/dinov2/layers/drop_path.py
ADDED
@@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
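A quick sketch of the train/eval behaviour of DropPath: in training mode whole samples are zeroed with probability drop_prob and the survivors are rescaled by 1 / keep_prob; in eval mode it is the identity. Import path as before, assuming the repo is on PYTHONPATH:

import torch
from svrm.ldm.modules.encoders.dinov2.layers.drop_path import DropPath

torch.manual_seed(0)
dp = DropPath(drop_prob=0.5)
x = torch.ones(4, 3)

dp.train()
print(dp(x))  # roughly half the rows are zeroed, the rest scaled to 2.0 (= 1 / keep_prob)

dp.eval()
print(dp(x))  # identity in eval mode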
svrm/ldm/modules/encoders/dinov2/layers/layer_scale.py
ADDED
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110

from typing import Union

import torch
from torch import Tensor
from torch import nn


class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma
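Finally, a sketch of LayerScale: a learned per-channel scale initialised to init_values, which is why a freshly built block contributes almost nothing to its residual branch until the scale is trained. Import path assumes the repo root is on PYTHONPATH:

import torch
from svrm.ldm.modules.encoders.dinov2.layers.layer_scale import LayerScale

ls = LayerScale(dim=4, init_values=1e-5)
x = torch.ones(2, 3, 4)
print(ls(x))           # every element equals 1e-5 at initialisation
print(ls.gamma.shape)  # torch.Size([4]) -- one learnable scale per channel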