diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c690e034c281deff6d2710c09bf60a0c050fbd52 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.obj filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d8721c3a30d6176f160d1dc59eb6ed2318f4b56b --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*.so + +# Virtual environment +venv/ +env/ +.venv/ + +# Jupyter Notebook checkpoints +.ipynb_checkpoints/ + +# Logs and local environment files +*.log +*.env +.env.local + +# PyTorch or TensorFlow saved models +*.pt +*.pth +*.h5 + +# VSCode settings (if using VSCode) +.vscode/ + +# Hugging Face cache (optional) +~/.cache/huggingface/ diff --git a/README.md b/README.md index 44c0492bfd7c2d44a2ba2881e457de0bc0e66767..a93d6d587305dacfd5c07cc207022d6b834de7f0 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,24 @@ --- -title: Ditto Api -emoji: 🌖 -colorFrom: blue -colorTo: green +title: Ditto +emoji: 🐢 +colorFrom: yellow +colorTo: yellow sdk: gradio sdk_version: 5.17.1 app_file: app.py pinned: false license: mit -short_description: Api to generate 3D object out of an image +short_description: Image to 3D object generator --- Check out the configuration reference at https://huggingface.co./docs/hub/spaces-config-reference + +# Setup + +``` +uv pip compile requirements.txt -o requirements-uv.txt --index-strategy unsafe-best-match --no-build-isolation -p 3.10 + +pip install -r requirements.txt + +python setup.py install +``` \ No newline at end of file diff --git a/assets/shoes.png b/assets/shoes.png new file mode 100644 index 0000000000000000000000000000000000000000..f4f6ac504dfc20e1a12cb5da3b65c0c42ae80ea6 Binary files /dev/null and b/assets/shoes.png differ diff --git a/build/lib/hy3dgen/__init__.py b/build/lib/hy3dgen/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/build/lib/hy3dgen/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/build/lib/hy3dgen/rembg.py b/build/lib/hy3dgen/rembg.py new file mode 100644 index 0000000000000000000000000000000000000000..c0d99483c8354fc10c6689b5cf12ebcd44368d92 --- /dev/null +++ b/build/lib/hy3dgen/rembg.py @@ -0,0 +1,36 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ + +from PIL import Image +from rembg import remove, new_session + + +class BackgroundRemover(): + def __init__(self): + self.session = new_session() + + def __call__(self, image: Image.Image): + output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0]) + return output diff --git a/build/lib/hy3dgen/shapegen/__init__.py b/build/lib/hy3dgen/shapegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d1f9534c15d029511d910d29e45da5ba7b8c8714 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/__init__.py @@ -0,0 +1,27 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline +from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover +from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR diff --git a/build/lib/hy3dgen/shapegen/models/__init__.py b/build/lib/hy3dgen/shapegen/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..684b3e389737fb988f5e363e777c34f6cd1fe4ea --- /dev/null +++ b/build/lib/hy3dgen/shapegen/models/__init__.py @@ -0,0 +1,28 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder +from .hunyuan3ddit import Hunyuan3DDiT +from .vae import ShapeVAE diff --git a/build/lib/hy3dgen/shapegen/models/conditioner.py b/build/lib/hy3dgen/shapegen/models/conditioner.py new file mode 100644 index 0000000000000000000000000000000000000000..1af4c0cc440a193167c0837621c3494242b95f3d --- /dev/null +++ b/build/lib/hy3dgen/shapegen/models/conditioner.py @@ -0,0 +1,165 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
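For quick reference, a minimal usage sketch of the `BackgroundRemover` defined in `build/lib/hy3dgen/rembg.py` above — not part of the diff itself. It assumes the package has been installed as `hy3dgen` per the README setup and that `rembg` can download its default session weights on first use; `assets/shoes.png` is the image added earlier in this diff.

```python
from PIL import Image

from hy3dgen.rembg import BackgroundRemover

# A single rembg session is created in __init__ and reused across calls.
remover = BackgroundRemover()

image = Image.open("assets/shoes.png").convert("RGB")
# The call returns an RGBA PIL image with the background made transparent.
image_rgba = remover(image)
image_rgba.save("shoes_rgba.png")
```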
+ +import torch +import torch.nn as nn +from torchvision import transforms +from transformers import ( + CLIPVisionModelWithProjection, + CLIPVisionConfig, + Dinov2Model, + Dinov2Config, +) + + +class ImageEncoder(nn.Module): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + **kwargs, + ): + super().__init__() + + if config is None: + self.model = self.MODEL_CLASS.from_pretrained(version) + else: + self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config)) + self.model.eval() + self.model.requires_grad_(False) + self.use_cls_token = use_cls_token + self.size = image_size // 14 + self.num_patches = (image_size // 14) ** 2 + if self.use_cls_token: + self.num_patches += 1 + + self.transform = transforms.Compose( + [ + transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True), + transforms.CenterCrop(image_size), + transforms.Normalize( + mean=self.mean, + std=self.std, + ), + ] + ) + + def forward(self, image, mask=None, value_range=(-1, 1)): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + if not self.use_cls_token: + last_hidden_state = last_hidden_state[:, 1:, :] + + return last_hidden_state + + def unconditional_embedding(self, batch_size): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches, + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + + return zero + + +class CLIPImageEncoder(ImageEncoder): + MODEL_CLASS = CLIPVisionModelWithProjection + MODEL_CONFIG_CLASS = CLIPVisionConfig + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + + +class DinoImageEncoder(ImageEncoder): + MODEL_CLASS = Dinov2Model + MODEL_CONFIG_CLASS = Dinov2Config + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + + +def build_image_encoder(config): + if config['type'] == 'CLIPImageEncoder': + return CLIPImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoder': + return DinoImageEncoder(**config['kwargs']) + else: + raise ValueError(f'Unknown image encoder type: {config["type"]}') + + +class DualImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + additional_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + self.additional_image_encoder = build_image_encoder(additional_image_encoder) + + def forward(self, image, mask=None): + outputs = { + 'main': self.main_image_encoder(image, mask=mask), + 'additional': self.additional_image_encoder(image, mask=mask), + } + return outputs + + def unconditional_embedding(self, batch_size): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size), + 'additional': self.additional_image_encoder.unconditional_embedding(batch_size), + } + return outputs + + +class SingleImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + + def forward(self, image, mask=None): + outputs = { + 'main': self.main_image_encoder(image, mask=mask), + } + return outputs + + def unconditional_embedding(self, batch_size): + outputs = { + 'main': 
self.main_image_encoder.unconditional_embedding(batch_size), + } + return outputs diff --git a/build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py b/build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py new file mode 100644 index 0000000000000000000000000000000000000000..d1c778666890cb13538eba15460cf0c05c7f9130 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py @@ -0,0 +1,390 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +from dataclasses import dataclass +from typing import List, Tuple, Optional + +import torch +from einops import rearrange +from torch import Tensor, nn + + +def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor: + x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = rearrange(x, "B H L D -> B L (H D)") + return x + + +def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. 
+ """ + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + t.device + ) + + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(t) + return embedding + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms).to(dtype=x_dtype) * self.scale + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +class SelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = nn.Linear(dim, dim) + + def forward(self, x: Tensor, pe: Tensor) -> Tensor: + qkv = self.qkv(x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int, double: bool): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) + + def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: + out = self.lin(nn.functional.silu(vec))[:, None, :] + out = out.chunk(self.multiplier, dim=-1) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) + + +class DoubleStreamBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_mod = Modulation(hidden_size, double=True) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = 
SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] + + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + return img, txt + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: Optional[float] = None, + ): + super().__init__() + + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = nn.GELU(approximate="tanh") + self.modulation = Modulation(hidden_size, double=False) + + def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + mod, _ = self.modulation(vec) + + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + return x + mod.gate * output + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +class Hunyuan3DDiT(nn.Module): + def __init__( + self, + in_channels: int = 64, + context_in_dim: int = 1536, + hidden_size: int = 1024, + mlp_ratio: float = 4.0, + num_heads: int = 16, + depth: int = 16, + depth_single_blocks: int = 32, + axes_dim: List[int] = [64], + theta: int = 10_000, + qkv_bias: bool = True, + time_factor: float = 1000, + ckpt_path: Optional[str] = None, + **kwargs, + ): + super().__init__() + self.in_channels = in_channels + self.context_in_dim = context_in_dim + self.hidden_size = hidden_size + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.depth = depth + self.depth_single_blocks = depth_single_blocks + self.axes_dim = axes_dim + self.theta = theta + self.qkv_bias = qkv_bias + self.time_factor = time_factor + self.out_channels = self.in_channels + + if hidden_size % num_heads != 0: + raise ValueError( + f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" + ) + pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = hidden_size + self.num_heads = num_heads + self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.cond_in = nn.Linear(context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + ) + for _ in range(depth) + ] 
+ ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + ) + for _ in range(depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + if ckpt_path is not None: + print('restored denoiser ckpt', ckpt_path) + + ckpt = torch.load(ckpt_path, map_location="cpu") + if 'state_dict' not in ckpt: + # deepspeed ckpt + state_dict = {} + for k in ckpt.keys(): + new_k = k.replace('_forward_module.', '') + state_dict[new_k] = ckpt[k] + else: + state_dict = ckpt["state_dict"] + + final_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('model.'): + final_state_dict[k.replace('model.', '')] = v + else: + final_state_dict[k] = v + missing, unexpected = self.load_state_dict(final_state_dict, strict=False) + print('unexpected keys:', unexpected) + print('missing keys:', missing) + + def forward( + self, + x, + t, + contexts, + **kwargs, + ) -> Tensor: + cond = contexts['main'] + latent = self.latent_in(x) + vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype)) + cond = self.cond_in(cond) + pe = None + + for block in self.double_blocks: + latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe) + + latent = torch.cat((cond, latent), 1) + for block in self.single_blocks: + latent = block(latent, vec=vec, pe=pe) + + latent = latent[:, cond.shape[1]:, ...] + latent = self.final_layer(latent, vec) + return latent diff --git a/build/lib/hy3dgen/shapegen/models/vae.py b/build/lib/hy3dgen/shapegen/models/vae.py new file mode 100644 index 0000000000000000000000000000000000000000..aef2784ac0db653714e711d12697eafc962c2aa3 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/models/vae.py @@ -0,0 +1,636 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
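A hedged shape check for the `Hunyuan3DDiT` denoiser above. The hyperparameters below are deliberately tiny stand-ins (the class defaults are `hidden_size=1024`, `depth=16`, `depth_single_blocks=32`), chosen only so the forward pass runs quickly on CPU; the latent and condition lengths are arbitrary.

```python
import torch

from hy3dgen.shapegen.models.hunyuan3ddit import Hunyuan3DDiT

# Toy sizes for illustration only; axes_dim must sum to hidden_size // num_heads.
model = Hunyuan3DDiT(
    in_channels=8,
    context_in_dim=32,
    hidden_size=64,
    num_heads=4,
    depth=1,
    depth_single_blocks=1,
    axes_dim=[16],
)

x = torch.randn(2, 10, 8)                   # (batch, latent tokens, in_channels)
t = torch.rand(2)                           # one timestep per sample
contexts = {"main": torch.randn(2, 5, 32)}  # image-conditioning tokens

out = model(x, t, contexts)
print(out.shape)                            # torch.Size([2, 10, 8]) -- same shape as the input latent
```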
+ +from typing import Tuple, List, Union, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from skimage import measure +from tqdm import tqdm + + +class FourierEmbedder(nn.Module): + """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts + each feature dimension of `x[..., i]` into: + [ + sin(x[..., i]), + sin(f_1*x[..., i]), + sin(f_2*x[..., i]), + ... + sin(f_N * x[..., i]), + cos(x[..., i]), + cos(f_1*x[..., i]), + cos(f_2*x[..., i]), + ... + cos(f_N * x[..., i]), + x[..., i] # only present if include_input is True. + ], here f_i is the frequency. + + Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs]. + If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...]; + Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]. + + Args: + num_freqs (int): the number of frequencies, default is 6; + logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]; + input_dim (int): the input dimension, default is 3; + include_input (bool): include the input tensor or not, default is True. + + Attributes: + frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1); + + out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1), + otherwise, it is input_dim * num_freqs * 2. + + """ + + def __init__(self, + num_freqs: int = 6, + logspace: bool = True, + input_dim: int = 3, + include_input: bool = True, + include_pi: bool = True) -> None: + + """The initialization""" + + super().__init__() + + if logspace: + frequencies = 2.0 ** torch.arange( + num_freqs, + dtype=torch.float32 + ) + else: + frequencies = torch.linspace( + 1.0, + 2.0 ** (num_freqs - 1), + num_freqs, + dtype=torch.float32 + ) + + if include_pi: + frequencies *= torch.pi + + self.register_buffer("frequencies", frequencies, persistent=False) + self.include_input = include_input + self.num_freqs = num_freqs + + self.out_dim = self.get_dims(input_dim) + + def get_dims(self, input_dim): + temp = 1 if self.include_input or self.num_freqs == 0 else 0 + out_dim = input_dim * (self.num_freqs * 2 + temp) + + return out_dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Forward process. + + Args: + x: tensor of shape [..., dim] + + Returns: + embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)] + where temp is 1 if include_input is True and 0 otherwise. + """ + + if self.num_freqs > 0: + embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1) + if self.include_input: + return torch.cat((x, embed.sin(), embed.cos()), dim=-1) + else: + return torch.cat((embed.sin(), embed.cos()), dim=-1) + else: + return x + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if self.drop_prob == 0. or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob, 3):0.3f}' + + +class MLP(nn.Module): + def __init__( + self, *, + width: int, + output_width: int = None, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.width = width + self.c_fc = nn.Linear(width, width * 4) + self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width) + self.gelu = nn.GELU() + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x): + return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) + + +class QKVMultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_data: Optional[int] = None, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_data = n_data + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, q, kv): + _, n_ctx, _ = q.shape + bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 + q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) + k, v = torch.split(kv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + + return out + + +class MultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + width: int, + heads: int, + qkv_bias: bool = True, + n_data: Optional[int] = None, + data_width: Optional[int] = None, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + self.n_data = n_data + self.width = width + self.heads = heads + self.data_width = width if data_width is None else data_width + self.c_q = nn.Linear(width, width, bias=qkv_bias) + self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadCrossAttention( + heads=heads, + n_data=n_data, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + + def forward(self, x, data): + x = self.c_q(x) + data = self.c_kv(data) + x = self.attention(x, data) + x = self.c_proj(x) + return x + + +class ResidualCrossAttentionBlock(nn.Module): + def __init__( + self, + *, + n_data: Optional[int] = None, + width: int, + heads: int, + data_width: Optional[int] = None, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + + if data_width is None: + data_width = width + + self.attn = MultiheadCrossAttention( + 
n_data=n_data, + width=width, + heads=heads, + data_width=data_width, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6) + self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width) + + def forward(self, x: torch.Tensor, data: torch.Tensor): + x = x + self.attn(self.ln_1(x), self.ln_2(data)) + x = x + self.mlp(self.ln_3(x)) + return x + + +class QKVMultiheadAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_ctx: int, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_ctx = n_ctx + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, qkv): + bs, n_ctx, width = qkv.shape + attn_ch = width // self.heads // 3 + qkv = qkv.view(bs, n_ctx, self.heads, -1) + q, k, v = torch.split(qkv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadAttention(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.heads = heads + self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadAttention( + heads=heads, + n_ctx=n_ctx, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward(self, x): + x = self.c_qkv(x) + x = self.attention(x) + x = self.drop_path(self.c_proj(x)) + return x + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.attn = MultiheadAttention( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, drop_path_rate=drop_path_rate) + self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6) + + def forward(self, x: torch.Tensor): + x = x + self.attn(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + layers: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList( + [ + ResidualAttentionBlock( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + for _ in range(layers) + ] + ) + + def forward(self, x: torch.Tensor): + for block in self.resblocks: + x = block(x) + return x + + +class CrossAttentionDecoder(nn.Module): + + def __init__( + self, + *, + num_latents: int, + out_channels: int, + fourier_embedder: FourierEmbedder, + width: int, + heads: int, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary" + ): + super().__init__() + + self.fourier_embedder = fourier_embedder + + self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width) + + self.cross_attn_decoder = ResidualCrossAttentionBlock( + n_data=num_latents, + width=width, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm + ) + + self.ln_post = nn.LayerNorm(width) + self.output_proj = nn.Linear(width, out_channels) + self.label_type = label_type + + def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor): + queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype)) + x = self.cross_attn_decoder(queries, latents) + x = self.ln_post(x) + occ = self.output_proj(x) + return occ + + +def generate_dense_grid_points(bbox_min: np.ndarray, + bbox_max: np.ndarray, + octree_depth: int, + indexing: str = "ij", + octree_resolution: int = None, + ): + length = bbox_max - bbox_min + num_cells = np.exp2(octree_depth) + if octree_resolution is not None: + num_cells = octree_resolution + + x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32) + y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) + z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) + [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) + xyz = np.stack((xs, ys, zs), axis=-1) + xyz = xyz.reshape(-1, 3) + grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + + return xyz, grid_size, length + + +def center_vertices(vertices): + """Translate the vertices so that bounding box is centered at zero.""" + vert_min = vertices.min(dim=0)[0] + vert_max = vertices.max(dim=0)[0] + vert_center = 0.5 * (vert_min + vert_max) + return vertices - vert_center + + +class Latent2MeshOutput: 
+ + def __init__(self, mesh_v=None, mesh_f=None): + self.mesh_v = mesh_v + self.mesh_f = mesh_f + + +class ShapeVAE(nn.Module): + def __init__( + self, + *, + num_latents: int, + embed_dim: int, + width: int, + heads: int, + num_decoder_layers: int, + num_freqs: int = 8, + include_pi: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary", + drop_path_rate: float = 0.0, + scale_factor: float = 1.0, + ): + super().__init__() + self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + + self.post_kl = nn.Linear(embed_dim, width) + + self.transformer = Transformer( + n_ctx=num_latents, + width=width, + layers=num_decoder_layers, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + + self.geo_decoder = CrossAttentionDecoder( + fourier_embedder=self.fourier_embedder, + out_channels=1, + num_latents=num_latents, + width=width, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + label_type=label_type, + ) + + self.scale_factor = scale_factor + self.latent_shape = (num_latents, embed_dim) + + def forward(self, latents): + latents = self.post_kl(latents) + latents = self.transformer(latents) + return latents + + @torch.no_grad() + def latents2mesh( + self, + latents: torch.FloatTensor, + bounds: Union[Tuple[float], List[float], float] = 1.1, + octree_depth: int = 7, + num_chunks: int = 10000, + mc_level: float = -1 / 512, + octree_resolution: int = None, + mc_algo: str = 'dmc', + ): + device = latents.device + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_depth=octree_depth, + octree_resolution=octree_resolution, + indexing="ij" + ) + xyz_samples = torch.FloatTensor(xyz_samples) + + # 2. latents to 3d volume + batch_logits = [] + batch_size = latents.shape[0] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), + desc=f"MC Level {mc_level} Implicit Function:"): + queries = xyz_samples[start: start + num_chunks, :].to(device) + queries = queries.half() + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + + logits = self.geo_decoder(batch_queries.to(latents.dtype), latents) + if mc_level == -1: + mc_level = 0 + logits = torch.sigmoid(logits) * 2 - 1 + print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.') + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=1) + grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float() + + # 3. 
extract surface + outputs = [] + for i in range(batch_size): + try: + if mc_algo == 'mc': + vertices, faces, normals, _ = measure.marching_cubes( + grid_logits[i].cpu().numpy(), + mc_level, + method="lewiner" + ) + vertices = vertices / grid_size * bbox_size + bbox_min + elif mc_algo == 'dmc': + if not hasattr(self, 'dmc'): + try: + from diso import DiffDMC + except: + raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'") + self.dmc = DiffDMC(dtype=torch.float32).to(device) + octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution + sdf = -grid_logits[i] / octree_resolution + verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True) + verts = center_vertices(verts) + vertices = verts.detach().cpu().numpy() + faces = faces.detach().cpu().numpy()[:, ::-1] + else: + raise ValueError(f"mc_algo {mc_algo} not supported.") + + outputs.append( + Latent2MeshOutput( + mesh_v=vertices.astype(np.float32), + mesh_f=np.ascontiguousarray(faces) + ) + ) + + except ValueError: + outputs.append(None) + except RuntimeError: + outputs.append(None) + + return outputs diff --git a/build/lib/hy3dgen/shapegen/pipelines.py b/build/lib/hy3dgen/shapegen/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..e4f10a49c4c153121a9b581d20f9e36a0f168499 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/pipelines.py @@ -0,0 +1,589 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
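The `FourierEmbedder` docstring earlier in `vae.py` describes the sin/cos layout; a small sketch of the resulting dimensionality, assuming the package is importable as `hy3dgen`. Note that the implementation concatenates the raw input ahead of the sin and cos terms.

```python
import torch

from hy3dgen.shapegen.models.vae import FourierEmbedder

embedder = FourierEmbedder(num_freqs=6, input_dim=3, include_input=True)

points = torch.rand(2, 1024, 3)   # a batch of 3D query points
embedding = embedder(points)

# out_dim = input_dim * (2 * num_freqs + 1) = 3 * 13 = 39 when include_input=True
print(embedder.out_dim)           # 39
print(embedding.shape)            # torch.Size([2, 1024, 39])
```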
+ +import copy +import importlib +import inspect +import logging +import os +from typing import List, Optional, Union + +import numpy as np +import torch +import trimesh +import yaml +from PIL import Image +from diffusers.utils.torch_utils import randn_tensor +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +def export_to_trimesh(mesh_output): + if isinstance(mesh_output, list): + outputs = [] + for mesh in mesh_output: + if mesh is None: + outputs.append(None) + else: + mesh.mesh_f = mesh.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) + outputs.append(mesh_output) + return outputs + else: + mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) + return mesh_output + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def instantiate_from_config(config, **kwargs): + if "target" not in config: + raise KeyError("Expected key `target` to instantiate.") + cls = get_obj_from_str(config["target"]) + params = config.get("params", dict()) + kwargs.update(params) + instance = cls(**kwargs) + return instance + + +class Hunyuan3DDiTPipeline: + @classmethod + def from_single_file( + cls, + ckpt_path, + config_path, + device='cpu', + dtype=torch.float16, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + logger.info(f"Loading model from {ckpt_path}") + + if ckpt_path.endswith('.safetensors'): + # parse safetensors + import safetensors.torch + safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + ckpt = {} + for key, value in safetensors_ckpt.items(): + model_name = key.split('.')[0] + new_key = key[len(model_name) + 1:] + if model_name not in ckpt: + ckpt[model_name] = {} + ckpt[model_name][new_key] = value + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + + # load model + from accelerate import init_empty_weights + with init_empty_weights(): + model = instantiate_from_config(config['model']) + vae = instantiate_from_config(config['vae']) + conditioner = instantiate_from_config(config['conditioner']) + image_processor = instantiate_from_config(config['image_processor']) + scheduler = instantiate_from_config(config['scheduler']) + + model.load_state_dict(ckpt['model'], assign = True) + vae.load_state_dict(ckpt['vae'], assign = True) + if 'conditioner' in ckpt: + conditioner.load_state_dict(ckpt['conditioner'], assign = True) + + model_kwargs = dict( + vae=vae, + model=model, + scheduler=scheduler, + conditioner=conditioner, + image_processor=image_processor, + device=device, + dtype=dtype, + ) + model_kwargs.update(kwargs) + + return cls( + **model_kwargs + ) + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + variant=None, + subfolder='hunyuan3d-dit-v2-0', + **kwargs, + ): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) + if not os.path.exists(model_path): + try: + import huggingface_hub + # download from huggingface + path = 
huggingface_hub.snapshot_download(repo_id=original_model_path) + model_path = os.path.join(path, subfolder) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." + ) + raise RuntimeError(f"Model path {model_path} not found") + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model path {original_model_path} not found") + + extension = 'ckpt' if not use_safetensors else 'safetensors' + variant = '' if variant is None else f'.{variant}' + ckpt_name = f'model{variant}.{extension}' + config_path = os.path.join(model_path, 'config.yaml') + ckpt_path = os.path.join(model_path, ckpt_name) + + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + variant=variant, + **kwargs + ) + + def __init__( + self, + vae, + model, + scheduler, + conditioner, + image_processor, + device='cuda', + dtype=torch.float16, + **kwargs + ): + self.vae = vae + self.model = model + self.scheduler = scheduler + self.conditioner = conditioner + self.image_processor = image_processor + + self.to(device, dtype) + + def to(self, device=None, dtype=None): + if device is not None: + self.device = torch.device(device) + self.vae.to(device) + self.model.to(device) + self.conditioner.to(device) + if dtype is not None: + self.dtype = dtype + self.vae.to(dtype=dtype) + self.model.to(dtype=dtype) + self.conditioner.to(dtype=dtype) + + def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance): + bsz = image.shape[0] + cond = self.conditioner(image=image, mask=mask) + + if do_classifier_free_guidance: + un_cond = self.conditioner.unconditional_embedding(bsz) + + if dual_guidance: + un_cond_drop_main = copy.deepcopy(un_cond) + un_cond_drop_main['additional'] = cond['additional'] + + def cat_recursive(a, b, c): + if isinstance(a, torch.Tensor): + return torch.cat([a, b, c], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k], c[k]) + return out + + cond = cat_recursive(cond, un_cond_drop_main, un_cond) + else: + un_cond = self.conditioner.unconditional_embedding(bsz) + + def cat_recursive(a, b): + if isinstance(a, torch.Tensor): + return torch.cat([a, b], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k]) + return out + + cond = cat_recursive(cond, un_cond) + return cond + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def prepare_latents(self, batch_size, dtype, device, generator, latents=None): + shape = (batch_size, *self.vae.latent_shape) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) + return latents + + def prepare_image(self, image): + if isinstance(image, str) and not os.path.exists(image): + raise FileNotFoundError(f"Couldn't find image at path {image}") + + if not isinstance(image, list): + image = [image] + image_pts = [] + mask_pts = [] + for img in image: + image_pt, mask_pt = self.image_processor(img, return_mask=True) + image_pts.append(image_pt) + mask_pts.append(mask_pt) + + image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype) + if mask_pts[0] is not None: + mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype) + else: + mask_pts = None + return image_pts, mask_pts + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + dual_guidance_scale: float = 10.5, + dual_guidance: bool = True, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=-1 / 512, + num_chunks=8000, + mc_algo='mc', + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and \ + getattr(self.model, 'guidance_cond_proj_dim', None) is None + dual_guidance = dual_guidance_scale >= 0 and dual_guidance + + image, mask = self.prepare_image(image) + cond = self.encode_cond(image=image, + mask=mask, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=dual_guidance) + batch_size = image.shape[0] + + t_dtype = torch.long + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas) + + latents = self.prepare_latents(batch_size, dtype, device, generator) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + guidance_cond = None + if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: + print('Using lcm guidance scale') + guidance_scale_tensor = torch.tensor(guidance_scale - 
1).repeat(batch_size) + guidance_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) + else: + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) + timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) + noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) + + # no drop, drop clip, all drop + if do_classifier_free_guidance: + if dual_guidance: + noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_clip - noise_pred_dino) + + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) + ) + else: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) + + def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo): + if not output_type == "latent": + latents = 1. / self.vae.scale_factor * latents + latents = self.vae(latents) + outputs = self.vae.latents2mesh( + latents, + bounds=box_v, + mc_level=mc_level, + num_chunks=num_chunks, + octree_resolution=octree_resolution, + mc_algo=mc_algo, + ) + else: + outputs = latents + + if output_type == 'trimesh': + outputs = export_to_trimesh(outputs) + + return outputs + + +class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=0.0, + mc_algo='mc', + num_chunks=8000, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and not ( + hasattr(self.model, 'guidance_embed') and + self.model.guidance_embed is True + ) + + image, mask = self.prepare_image(image) + cond = self.encode_cond( + image=image, + mask=mask, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=False, + ) + batch_size = image.shape[0] + + # 5. Prepare timesteps + # NOTE: this is slightly different from common usage, we start from 0. 
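+        # Worked example (illustrative): num_inference_steps=5 yields sigmas
+        # [0.00, 0.25, 0.50, 0.75, 1.00]; retrieve_timesteps hands them to
+        # scheduler.set_timesteps, which (after optional shifting) scales them by
+        # num_train_timesteps to build the timestep schedule.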
+ sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + ) + latents = self.prepare_latents(batch_size, dtype, device, generator) + + guidance = None + if hasattr(self.model, 'guidance_embed') and \ + self.model.guidance_embed is True: + guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) + + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * 2) + else: + latent_model_input = latents + + # NOTE: we assume model get timesteps ranged from 0 to 1 + timestep = t.expand(latent_model_input.shape[0]).to( + latents.dtype) / self.scheduler.config.num_train_timesteps + noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) + + if do_classifier_free_guidance: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) diff --git a/build/lib/hy3dgen/shapegen/postprocessors.py b/build/lib/hy3dgen/shapegen/postprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..0500fa2d8f70a3a933f8313d11126ad9b27bf57c --- /dev/null +++ b/build/lib/hy3dgen/shapegen/postprocessors.py @@ -0,0 +1,175 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
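+# Illustrative usage of the post-processors defined below (assumes `mesh` already
+# holds a generated trimesh.Trimesh; each callable also accepts a pymeshlab.MeshSet,
+# a Latent2MeshOutput or a file path):
+#
+#   from hy3dgen.shapegen.postprocessors import FloaterRemover, DegenerateFaceRemover, FaceReducer
+#   mesh = FloaterRemover()(mesh)                   # drop small disconnected components
+#   mesh = DegenerateFaceRemover()(mesh)            # clean the mesh via a PLY round-trip
+#   mesh = FaceReducer()(mesh, max_facenum=40000)   # quadric edge-collapse decimation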
+ +import os +import tempfile +from typing import Union + +import pymeshlab +import trimesh + +from .models.vae import Latent2MeshOutput + + +def load_mesh(path): + if path.endswith(".glb"): + mesh = trimesh.load(path) + else: + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(path) + return mesh + + +def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000): + mesh.apply_filter( + "meshing_decimation_quadric_edge_collapse", + targetfacenum=max_facenum, + qualitythr=1.0, + preserveboundary=True, + boundaryweight=3, + preservenormal=True, + preservetopology=True, + autoclean=True + ) + return mesh + + +def remove_floater(mesh: pymeshlab.MeshSet): + mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face", + nbfaceratio=0.005) + mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False) + mesh.apply_filter("meshing_remove_selected_vertices_and_faces") + return mesh + + +def pymeshlab2trimesh(mesh: pymeshlab.MeshSet): + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + mesh.save_current_mesh(temp_file_name) + mesh = trimesh.load(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + # 检查加载的对象类型 + if isinstance(mesh, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + # 如果是Scene,遍历所有的geometry并合并 + for geom in mesh.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + mesh = combined_mesh + return mesh + + +def trimesh2pymeshlab(mesh: trimesh.Trimesh): + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + if isinstance(mesh, trimesh.scene.Scene): + for idx, obj in enumerate(mesh.geometry.values()): + if idx == 0: + temp_mesh = obj + else: + temp_mesh = temp_mesh + obj + mesh = temp_mesh + mesh.export(temp_file_name) + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + return mesh + + +def export_mesh(input, output): + if isinstance(input, pymeshlab.MeshSet): + mesh = output + elif isinstance(input, Latent2MeshOutput): + output = Latent2MeshOutput() + output.mesh_v = output.current_mesh().vertex_matrix() + output.mesh_f = output.current_mesh().face_matrix() + mesh = output + else: + mesh = pymeshlab2trimesh(output) + return mesh + + +def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet: + if isinstance(mesh, str): + mesh = load_mesh(mesh) + elif isinstance(mesh, Latent2MeshOutput): + mesh = pymeshlab.MeshSet() + mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f) + mesh.add_mesh(mesh_pymeshlab, "converted_mesh") + + if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)): + mesh = trimesh2pymeshlab(mesh) + + return mesh + + +class FaceReducer: + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + max_facenum: int = 40000 + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]: + ms = import_mesh(mesh) + ms = reduce_face(ms, max_facenum=max_facenum) + mesh = export_mesh(mesh, ms) + return mesh + + +class FloaterRemover: + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + ms = remove_floater(ms) + mesh = export_mesh(mesh, ms) + return mesh + + +class DegenerateFaceRemover: + def 
__call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + ms.save_current_mesh(temp_file_name) + ms = pymeshlab.MeshSet() + ms.load_new_mesh(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + mesh = export_mesh(mesh, ms) + return mesh diff --git a/build/lib/hy3dgen/shapegen/preprocessors.py b/build/lib/hy3dgen/shapegen/preprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdaff2d16cc0844d8d23c886d35c2f4e7286ff7 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/preprocessors.py @@ -0,0 +1,127 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from einops import repeat, rearrange + + +def array_to_tensor(np_array): + image_pt = torch.tensor(np_array).float() + image_pt = image_pt / 255 * 2 - 1 + image_pt = rearrange(image_pt, "h w c -> c h w") + image_pts = repeat(image_pt, "c h w -> b c h w", b=1) + return image_pts + + +class ImageProcessorV2: + def __init__(self, size=512, border_ratio=None): + self.size = size + self.border_ratio = border_ratio + + @staticmethod + def recenter(image, border_ratio: float = 0.2): + """ recenter an image to leave some empty space at the image border. + + Args: + image (ndarray): input image, float/uint8 [H, W, 3/4] + mask (ndarray): alpha mask, bool [H, W] + border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. 
+ + Returns: + ndarray: output image, float/uint8 [H, W, 3/4] + """ + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = np.ones_like(image[..., 0:1]) * 255 + image = np.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = np.zeros((size, size, C), dtype=np.uint8) + + coords = np.nonzero(mask) + x_min, x_max = coords[0].min(), coords[0].max() + y_min, y_max = coords[1].min(), coords[1].max() + h = x_max - x_min + w = y_max - y_min + if h == 0 or w == 0: + raise ValueError('input image is empty') + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + h2 = int(h * scale) + w2 = int(w * scale) + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), + interpolation=cv2.INTER_AREA) + + bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + mask = result[..., 3:].astype(np.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).astype(np.uint8) + mask = mask.clip(0, 255).astype(np.uint8) + return result, mask + + def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + print(f"Using border_ratio from init: {border_ratio}") + if isinstance(image, str): + image = cv2.imread(image, cv2.IMREAD_UNCHANGED) + image, mask = self.recenter(image, border_ratio=border_ratio) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif isinstance(image, Image.Image): + image = np.asarray(image) + image, mask = self.recenter(image, border_ratio=border_ratio) + + image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) + mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) + mask = mask[..., np.newaxis] + + if to_tensor: + image = array_to_tensor(image) + mask = array_to_tensor(mask) + if return_mask: + return image, mask + return image + + +IMAGE_PROCESSORS = { + "v2": ImageProcessorV2, +} + +DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/build/lib/hy3dgen/shapegen/schedulers.py b/build/lib/hy3dgen/shapegen/schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..0069f5cd49c5095930b588f01129a77f172171a7 --- /dev/null +++ b/build/lib/hy3dgen/shapegen/schedulers.py @@ -0,0 +1,307 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
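+# The scheduler below performs one Euler step of the flow-matching ODE per call to
+# `step`: x_next = x + (sigma_next - sigma) * v, with v the model prediction and the
+# sigma schedule optionally shifted by sigma' = shift * sigma / (1 + (shift - 1) * sigma).
+# Minimal sketch of the same update (illustrative helper, not defined in this module):
+#
+#   def euler_step(x, v, sigma, sigma_next):
+#       return x + (sigma_next - sigma) * v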
+ +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed + + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co./papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. 
+ timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.config.num_train_timesteps + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.config.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/build/lib/hy3dgen/texgen/__init__.py b/build/lib/hy3dgen/texgen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1f890f024d507021eca8087d40dc472de36152bd --- /dev/null +++ b/build/lib/hy3dgen/texgen/__init__.py @@ -0,0 +1,26 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py b/build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
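+# Illustrative use of the `hy3dgen.texgen` exports above. The call signature is an
+# assumption: it is taken to mirror the shape pipeline's `from_pretrained` pattern,
+# and 'input.png' is a placeholder path, not a file shipped with this module:
+#
+#   from hy3dgen.texgen import Hunyuan3DPaintPipeline
+#   paint = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
+#   textured_mesh = paint(mesh, image='input.png')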
diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..289710ab787a174b39154f1010fc6209e4c92dfe --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py @@ -0,0 +1,116 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math + +import numpy as np +import torch + + +def transform_pos(mtx, pos, keepdim=False): + t_mtx = torch.from_numpy(mtx).to( + pos.device) if isinstance( + mtx, np.ndarray) else mtx + if pos.shape[-1] == 3: + posw = torch.cat( + [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1) + else: + posw = pos + + if keepdim: + return torch.matmul(posw, t_mtx.t())[...] + else: + return torch.matmul(posw, t_mtx.t())[None, ...] 
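+# transform_pos applies a 4x4 matrix to row-vector points: xyz is padded to
+# homogeneous xyzw and the result is posw @ mtx.T (i.e. mtx @ p for each point).
+# Minimal sketch (an identity matrix leaves the points unchanged):
+#
+#   pts = torch.tensor([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]])
+#   out = transform_pos(np.eye(4, dtype=np.float32), pts, keepdim=True)
+#   # out[:, :3] equals pts and out[:, 3] is all ones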
+ + +def get_mv_matrix(elev, azim, camera_distance, center=None): + elev = -elev + azim += 90 + + elev_rad = math.radians(elev) + azim_rad = math.radians(azim) + + camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad), + camera_distance * + math.cos(elev_rad) * math.sin(azim_rad), + camera_distance * math.sin(elev_rad)]) + + if center is None: + center = np.array([0, 0, 0]) + else: + center = np.array(center) + + lookat = center - camera_position + lookat = lookat / np.linalg.norm(lookat) + + up = np.array([0, 0, 1.0]) + right = np.cross(lookat, up) + right = right / np.linalg.norm(right) + up = np.cross(right, lookat) + up = up / np.linalg.norm(up) + + c2w = np.concatenate( + [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1) + + w2c = np.zeros((4, 4)) + w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0)) + w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:]) + w2c[3, 3] = 1.0 + + return w2c.astype(np.float32) + + +def get_orthographic_projection_matrix( + left=-1, right=1, bottom=-1, top=1, near=0, far=2): + """ + 计算正交投影矩阵。 + + 参数: + left (float): 投影区域左侧边界。 + right (float): 投影区域右侧边界。 + bottom (float): 投影区域底部边界。 + top (float): 投影区域顶部边界。 + near (float): 投影区域近裁剪面距离。 + far (float): 投影区域远裁剪面距离。 + + 返回: + numpy.ndarray: 正交投影矩阵。 + """ + ortho_matrix = np.eye(4, dtype=np.float32) + ortho_matrix[0, 0] = 2 / (right - left) + ortho_matrix[1, 1] = 2 / (top - bottom) + ortho_matrix[2, 2] = -2 / (far - near) + ortho_matrix[0, 3] = -(right + left) / (right - left) + ortho_matrix[1, 3] = -(top + bottom) / (top - bottom) + ortho_matrix[2, 3] = -(far + near) / (far - near) + return ortho_matrix + + +def get_perspective_projection_matrix(fovy, aspect_wh, near, far): + fovy_rad = math.radians(fovy) + return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0], + [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0], + [0, 0, -(far + near) / (far - near), - + 2.0 * far * near / (far - near)], + [0, 0, -1, 0]]).astype(np.float32) diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..a96955c19757df5ad18095b33829962140c04647 --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py @@ -0,0 +1,70 @@ +import numpy as np + +def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): + texture_height, texture_width, texture_channel = texture.shape + vtx_num = vtx_pos.shape[0] + + vtx_mask = np.zeros(vtx_num, dtype=np.float32) + vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] + uncolored_vtxs = [] + G = [[] for _ in range(vtx_num)] + + for i in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[i, k] + vtx_idx = pos_idx[i, k] + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + if mask[uv_u, uv_v] > 0: + vtx_mask[vtx_idx] = 1.0 + vtx_color[vtx_idx] = texture[uv_u, uv_v] + else: + uncolored_vtxs.append(vtx_idx) + G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) + + smooth_count = 2 + last_uncolored_vtx_count = 0 + while smooth_count > 0: + uncolored_vtx_count = 0 + for vtx_idx in uncolored_vtxs: + sum_color = np.zeros(texture_channel, dtype=np.float32) + total_weight = 0.0 + vtx_0 = vtx_pos[vtx_idx] + for connected_idx in G[vtx_idx]: + if vtx_mask[connected_idx] > 0: + vtx1 = vtx_pos[connected_idx] 
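+                    # weight each coloured neighbour by inverse squared distance (1/d^2)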
+ dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) + dist_weight = 1.0 / max(dist, 1e-4) + dist_weight *= dist_weight + sum_color += vtx_color[connected_idx] * dist_weight + total_weight += dist_weight + if total_weight > 0: + vtx_color[vtx_idx] = sum_color / total_weight + vtx_mask[vtx_idx] = 1.0 + else: + uncolored_vtx_count += 1 + + if last_uncolored_vtx_count == uncolored_vtx_count: + smooth_count -= 1 + else: + smooth_count += 1 + last_uncolored_vtx_count = uncolored_vtx_count + + new_texture = texture.copy() + new_mask = mask.copy() + for face_idx in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[face_idx, k] + vtx_idx = pos_idx[face_idx, k] + if vtx_mask[vtx_idx] == 1.0: + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + new_texture[uv_u, uv_v] = vtx_color[vtx_idx] + new_mask[uv_u, uv_v] = 255 + return new_texture, new_mask + +def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): + if method == "smooth": + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + else: + raise ValueError("Invalid method. Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py new file mode 100644 index 0000000000000000000000000000000000000000..c85b80e043221282e9ff6bfb81764fb32c5d48ed --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py @@ -0,0 +1,833 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
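+# The grid helpers below (stride_from_shape, scatter_add_nd_with_count,
+# linear_grid_put_2d) bilinearly splat per-point values into a 2D grid and divide by
+# the accumulated weights; this underpins the 'linear' bake mode configured in
+# MeshRender below. Illustrative call (shapes only, assumed inputs):
+#
+#   coords = torch.rand(1024, 2)    # points in [0, 1] x [0, 1]
+#   values = torch.rand(1024, 3)    # per-point RGB
+#   tex = linear_grid_put_2d(512, 512, coords, values)   # -> tensor of shape [512, 512, 3]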
+ +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import trimesh +from PIL import Image + +from .camera_utils import ( + transform_pos, + get_mv_matrix, + get_orthographic_projection_matrix, + get_perspective_projection_matrix, +) +from .mesh_processor import meshVerticeInpaint +from .mesh_utils import load_mesh, save_mesh + + +def stride_from_shape(shape): + stride = [1] + for x in reversed(shape[1:]): + stride.append(stride[-1] * x) + return list(reversed(stride)) + + +def scatter_add_nd_with_count(input, count, indices, values, weights=None): + # input: [..., C], D dimension + C channel + # count: [..., 1], D dimension + # indices: [N, D], long + # values: [N, C] + + D = indices.shape[-1] + C = input.shape[-1] + size = input.shape[:-1] + stride = stride_from_shape(size) + + assert len(size) == D + + input = input.view(-1, C) # [HW, C] + count = count.view(-1, 1) + + flatten_indices = (indices * torch.tensor(stride, + dtype=torch.long, device=indices.device)).sum(-1) # [N] + + if weights is None: + weights = torch.ones_like(values[..., :1]) + + input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) + count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) + + return input.view(*size, C), count.view(*size, 1) + + +def linear_grid_put_2d(H, W, coords, values, return_count=False): + # coords: [N, 2], float in [0, 1] + # values: [N, C] + + C = values.shape[-1] + + indices = coords * torch.tensor( + [H - 1, W - 1], dtype=torch.float32, device=coords.device + ) + indices_00 = indices.floor().long() # [N, 2] + indices_00[:, 0].clamp_(0, H - 2) + indices_00[:, 1].clamp_(0, W - 2) + indices_01 = indices_00 + torch.tensor( + [0, 1], dtype=torch.long, device=indices.device + ) + indices_10 = indices_00 + torch.tensor( + [1, 0], dtype=torch.long, device=indices.device + ) + indices_11 = indices_00 + torch.tensor( + [1, 1], dtype=torch.long, device=indices.device + ) + + h = indices[..., 0] - indices_00[..., 0].float() + w = indices[..., 1] - indices_00[..., 1].float() + w_00 = (1 - h) * (1 - w) + w_01 = (1 - h) * w + w_10 = h * (1 - w) + w_11 = h * w + + result = torch.zeros(H, W, C, device=values.device, + dtype=values.dtype) # [H, W, C] + count = torch.zeros(H, W, 1, device=values.device, + dtype=values.dtype) # [H, W, 1] + weights = torch.ones_like(values[..., :1]) # [N, 1] + + result, count = scatter_add_nd_with_count( + result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) + + if return_count: + return result, count + + mask = (count.squeeze(-1) > 0) + result[mask] = result[mask] / count[mask].repeat(1, C) + + return result + + +class MeshRender(): + def __init__( + self, + camera_distance=1.45, camera_type='orth', + default_resolution=1024, texture_size=1024, + use_antialias=True, max_mip_level=None, filter_mode='linear', + bake_mode='linear', raster_mode='cr', device='cuda'): + + self.device = device + + self.set_default_render_resolution(default_resolution) + self.set_default_texture_resolution(texture_size) + + self.camera_distance = camera_distance + self.use_antialias = use_antialias + self.max_mip_level = max_mip_level 
+ self.filter_mode = filter_mode + + self.bake_angle_thres = 75 + self.bake_unreliable_kernel_size = int( + (2 / 512) * max(self.default_resolution[0], self.default_resolution[1])) + self.bake_mode = bake_mode + + self.raster_mode = raster_mode + if self.raster_mode == 'cr': + import custom_rasterizer as cr + self.raster = cr + else: + raise f'No raster named {self.raster_mode}' + + if camera_type == 'orth': + self.ortho_scale = 1.2 + self.camera_proj_mat = get_orthographic_projection_matrix( + left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5, + bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5, + near=0.1, far=100 + ) + elif camera_type == 'perspective': + self.camera_proj_mat = get_perspective_projection_matrix( + 49.13, self.default_resolution[1] / self.default_resolution[0], + 0.01, 100.0 + ) + else: + raise f'No camera type {camera_type}' + + def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True): + + if self.raster_mode == 'cr': + rast_out_db = None + if pos.dim() == 2: + pos = pos.unsqueeze(0) + findices, barycentric = self.raster.rasterize(pos, tri, resolution) + rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1) + rast_out = rast_out.unsqueeze(0) + else: + raise f'No raster named {self.raster_mode}' + + return rast_out, rast_out_db + + def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None): + + if self.raster_mode == 'cr': + textd = None + barycentric = rast_out[0, ..., :-1] + findices = rast_out[0, ..., -1] + if uv.dim() == 2: + uv = uv.unsqueeze(0) + textc = self.raster.interpolate(uv, findices, barycentric, uv_idx) + else: + raise f'No raster named {self.raster_mode}' + + return textc, textd + + def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', + boundary_mode='wrap', max_mip_level=None): + + if self.raster_mode == 'cr': + raise f'Texture is not implemented in cr' + else: + raise f'No raster named {self.raster_mode}' + + return color + + def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + + if self.raster_mode == 'cr': + # Antialias has not been supported yet + color = color + else: + raise f'No raster named {self.raster_mode}' + + return color + + def load_mesh( + self, + mesh, + scale_factor=1.15, + auto_center=True, + ): + vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh) + self.mesh_copy = mesh + self.set_mesh(vtx_pos, pos_idx, + vtx_uv=vtx_uv, uv_idx=uv_idx, + scale_factor=scale_factor, auto_center=auto_center + ) + if texture_data is not None: + self.set_texture(texture_data) + + def save_mesh(self): + texture_data = self.get_texture() + texture_data = Image.fromarray((texture_data * 255).astype(np.uint8)) + return save_mesh(self.mesh_copy, texture_data) + + def set_mesh( + self, + vtx_pos, pos_idx, + vtx_uv=None, uv_idx=None, + scale_factor=1.15, auto_center=True + ): + + self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float() + self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int) + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float() + self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int) + else: + self.vtx_uv = None + self.uv_idx = None + + self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]] + self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]] + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1] + + if auto_center: + max_bb = 
(self.vtx_pos - 0).max(0)[0] + min_bb = (self.vtx_pos - 0).min(0)[0] + center = (max_bb + min_bb) / 2 + scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0 + self.vtx_pos = (self.vtx_pos - center) * \ + (scale_factor / float(scale)) + self.scale_factor = scale_factor + + def set_texture(self, tex): + if isinstance(tex, np.ndarray): + tex = Image.fromarray((tex * 255).astype(np.uint8)) + elif isinstance(tex, torch.Tensor): + tex = tex.cpu().numpy() + tex = Image.fromarray((tex * 255).astype(np.uint8)) + + tex = tex.resize(self.texture_size).convert('RGB') + tex = np.array(tex) / 255.0 + self.tex = torch.from_numpy(tex).to(self.device) + self.tex = self.tex.float() + + def set_default_render_resolution(self, default_resolution): + if isinstance(default_resolution, int): + default_resolution = (default_resolution, default_resolution) + self.default_resolution = default_resolution + + def set_default_texture_resolution(self, texture_size): + if isinstance(texture_size, int): + texture_size = (texture_size, texture_size) + self.texture_size = texture_size + + def get_mesh(self): + vtx_pos = self.vtx_pos.cpu().numpy() + pos_idx = self.pos_idx.cpu().numpy() + vtx_uv = self.vtx_uv.cpu().numpy() + uv_idx = self.uv_idx.cpu().numpy() + + # 坐标变换的逆变换 + vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]] + vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]] + + vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1] + return vtx_pos, pos_idx, vtx_uv, uv_idx + + def get_texture(self): + return self.tex.cpu().numpy() + + def to(self, device): + self.device = device + + for attr_name in dir(self): + attr_value = getattr(self, attr_name) + if isinstance(attr_value, torch.Tensor): + setattr(self, attr_name, attr_value.to(self.device)) + + def color_rgb_to_srgb(self, image): + if isinstance(image, Image.Image): + image_rgb = torch.tesnor( + np.array(image) / + 255.0).float().to( + self.device) + elif isinstance(image, np.ndarray): + image_rgb = torch.tensor(image).float() + else: + image_rgb = image.to(self.device) + + image_srgb = torch.where( + image_rgb <= 0.0031308, + 12.92 * image_rgb, + 1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055 + ) + + if isinstance(image, Image.Image): + image_srgb = Image.fromarray( + (image_srgb.cpu().numpy() * + 255).astype( + np.uint8)) + elif isinstance(image, np.ndarray): + image_srgb = image_srgb.cpu().numpy() + else: + image_srgb = image_srgb.to(image.device) + + return image_srgb + + def _render( + self, + glctx, + mvp, + pos, + pos_idx, + uv, + uv_idx, + tex, + resolution, + max_mip_level, + keep_alpha, + filter_mode + ): + pos_clip = transform_pos(mvp, pos) + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + glctx, pos_clip, pos_idx, resolution=resolution) + + tex = tex.contiguous() + if filter_mode == 'linear-mipmap-linear': + texc, texd = self.raster_interpolate( + uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = self.raster_texture( + tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx) + color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + color = color * visible_mask # Mask out background. + if self.use_antialias: + color = self.raster_antialias(color, rast_out, pos_clip, pos_idx) + + if keep_alpha: + color = torch.cat([color, visible_mask], dim=-1) + return color[0, ...] 
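+    # Illustrative usage of this renderer (assumes `mesh` is a textured trimesh.Trimesh
+    # and custom_rasterizer is installed for the default 'cr' raster mode):
+    #
+    #   renderer = MeshRender(camera_type='orth', default_resolution=1024, texture_size=1024)
+    #   renderer.load_mesh(mesh)
+    #   rgba   = renderer.render(elev=0, azim=0, return_type='np')         # image with alpha
+    #   depth  = renderer.render_depth(elev=0, azim=0, return_type='np')   # normalised depth
+    #   normal = renderer.render_normal(elev=0, azim=0, return_type='np')  # normal map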
+ + def render( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + tex=None, + keep_alpha=True, + bgcolor=None, + filter_mode=None, + return_type='th' + ): + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + if tex is not None: + if isinstance(tex, Image.Image): + tex = torch.tensor(np.array(tex) / 255.0) + elif isinstance(tex, np.ndarray): + tex = torch.tensor(tex) + if tex.dim() == 2: + tex = tex.unsqueeze(-1) + tex = tex.float().to(self.device) + image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, + self.tex if tex is None else tex, + self.default_resolution if resolution is None else resolution, + self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) + mask = (image[..., [-1]] == 1).float() + if bgcolor is None: + bgcolor = [0 for _ in range(image.shape[-1] - 1)] + image = image * mask + (1 - mask) * \ + torch.tensor(bgcolor + [0]).to(self.device) + if keep_alpha == False: + image = image[..., :-1] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_normal( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + bg_color=[1, 1, 1], + use_abs_coor=False, + normalize_rgb=True, + return_type='th' + ): + + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + if use_abs_coor: + mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] + else: + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + mesh_triangles = pos_camera[self.pos_idx[:, :3], :] + face_normals = F.normalize( + torch.cross(mesh_triangles[:, + 1, + :] - mesh_triangles[:, + 0, + :], + mesh_triangles[:, + 2, + :] - mesh_triangles[:, + 0, + :], + dim=-1), + dim=-1) + + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + + # Interpolate normal values across the rasterized pixels + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + normal = normal * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) # Mask out background. + + if normalize_rgb: + normal = (normal + 1) * 0.5 + if self.use_antialias: + normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) + + image = normal[0, ...] 
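+        # convert to the requested output: 'th' (torch tensor, default), 'np' (numpy array), 'pl' (PIL image)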
+ if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + + return image + + def convert_normal_map(self, image): + # blue is front, red is left, green is top + if isinstance(image, Image.Image): + image = np.array(image) + mask = (image == [255, 255, 255]).all(axis=-1) + + image = (image / 255.0) * 2.0 - 1.0 + + image[..., [1]] = -image[..., [1]] + image[..., [1, 2]] = image[..., [2, 1]] + image[..., [0]] = -image[..., [0]] + + image = (image + 1.0) * 0.5 + + image = (image * 255).astype(np.uint8) + image[mask] = [127, 127, 255] + + return Image.fromarray(image) + + def get_pos_from_mvp(self, elev, azim, camera_distance, center): + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + + return pos_camera, pos_clip + + def render_depth( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + return_type='th' + ): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + + # Interpolate depth values across the rasterized pixels + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth = (depth - depth_min) / (depth_max - depth_min) + + depth = depth * visible_mask # Mask out background. + if self.use_antialias: + depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) + + image = depth[0, ...] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_position(self, elev, azim, camera_distance=None, center=None, + resolution=None, bg_color=[1, 1, 1], return_type='th'): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor + tex_position = tex_position.contiguous() + + # Interpolate depth values across the rasterized pixels + position, _ = self.raster_interpolate( + tex_position[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + + position = position * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) # Mask out background. + if self.use_antialias: + position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) + + image = position[0, ...] 
+ + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_uvpos(self, return_type='th'): + image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def uv_feature_map(self, vert_feat, bg=None): + vtx_uv = self.vtx_uv * 2 - 1.0 + vtx_uv = torch.cat( + [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) + vtx_uv[..., -1] = 1 + uv_idx = self.uv_idx + rast_out, rast_out_db = self.raster_rasterize( + vtx_uv, uv_idx, resolution=self.texture_size) + feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) + feat_map = feat_map[0, ...] + if bg is not None: + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + feat_map[visible_mask == 0] = bg + return feat_map + + def render_sketch_from_geometry(self, normal_image, depth_image): + normal_image_np = normal_image.cpu().numpy() + depth_image_np = depth_image.cpu().numpy() + + normal_image_np = (normal_image_np * 255).astype(np.uint8) + depth_image_np = (depth_image_np * 255).astype(np.uint8) + normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) + + normal_edges = cv2.Canny(normal_image_np, 80, 150) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + + combined_edges = np.maximum(normal_edges, depth_edges) + + sketch_image = torch.from_numpy(combined_edges).to( + normal_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + + return sketch_image + + def render_sketch_from_depth(self, depth_image): + depth_image_np = depth_image.cpu().numpy() + depth_image_np = (depth_image_np * 255).astype(np.uint8) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + combined_edges = depth_edges + sketch_image = torch.from_numpy(combined_edges).to( + depth_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + return sketch_image + + def back_project(self, image, elev, azim, + camera_distance=None, center=None, method=None): + if isinstance(image, Image.Image): + image = torch.tensor(np.array(image) / 255.0) + elif isinstance(image, np.ndarray): + image = torch.tensor(image) + if image.dim() == 2: + image = image.unsqueeze(-1) + image = image.float().to(self.device) + resolution = image.shape[:2] + channel = image.shape[-1] + texture = torch.zeros(self.texture_size + (channel,)).to(self.device) + cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + v0 = pos_camera[self.pos_idx[:, 0], :] + v1 = pos_camera[self.pos_idx[:, 1], :] + v2 = pos_camera[self.pos_idx[:, 2], :] + face_normals = F.normalize( + torch.cross( + v1 - v0, + v2 - v0, + dim=-1), + dim=-1) + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + rast_out, rast_out_db 
= self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + normal = normal[0, ...] + uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + depth = depth[0, ...] + + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth_normalized = (depth - depth_min) / (depth_max - depth_min) + depth_image = depth_normalized * visible_mask # Mask out background. + + sketch_image = self.render_sketch_from_depth(depth_image) + + lookat = torch.tensor([[0, 0, -1]], device=self.device) + cos_image = torch.nn.functional.cosine_similarity( + lookat, normal.view(-1, 3)) + cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) + + cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) + cos_image[cos_image < cos_thres] = 0 + + # shrink + kernel_size = self.bake_unreliable_kernel_size * 2 + 1 + kernel = torch.ones( + (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( + sketch_image.device) + + visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() + visible_mask = F.conv2d( + 1.0 - visible_mask, + kernel, + padding=kernel_size // 2) + visible_mask = 1.0 - (visible_mask > 0).float() # binarize + visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) + + sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) + sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) + sketch_image = (sketch_image > 0).float() # binarize + sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) + visible_mask = visible_mask * (sketch_image < 0.5) + + cos_image[visible_mask == 0] = 0 + + method = self.bake_mode if method is None else method + + if method == 'linear': + proj_mask = (visible_mask != 0).view(-1) + uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] + image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] + cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] + sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] + + texture = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) + cos_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) + boundary_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) + else: + raise ValueError(f'No bake mode {method}') + + return texture, cos_map, boundary_map + + def bake_texture(self, colors, elevs, azims, + camera_distance=None, center=None, exp=6, weights=None): + for i in range(len(colors)): + if isinstance(colors[i], Image.Image): + colors[i] = torch.tensor( + np.array( + colors[i]) / 255.0, + device=self.device).float() + if weights is None: + weights = [1.0 for _ in range(len(colors))] + textures = [] + cos_maps = [] + for color, elev, azim, weight in zip(colors, elevs, azims, weights): + texture, cos_map, _ = self.back_project( + color, elev, azim, camera_distance, center) + cos_map = weight * (cos_map ** exp) + textures.append(texture) + cos_maps.append(cos_map) + + texture_merge, trust_map_merge = self.fast_bake_texture( + textures, cos_maps) + return texture_merge, trust_map_merge + + @torch.no_grad() + def fast_bake_texture(self, textures, cos_maps): + + channel = textures[0].shape[-1] + texture_merge = torch.zeros( + self.texture_size + (channel,)).to(self.device) + trust_map_merge =
torch.zeros(self.texture_size + (1,)).to(self.device) + for texture, cos_map in zip(textures, cos_maps): + view_sum = (cos_map > 0).sum() + painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() + if painted_sum / view_sum > 0.99: + continue + texture_merge += texture * cos_map + trust_map_merge += cos_map + texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) + + return texture_merge, trust_map_merge > 1E-8 + + def uv_inpaint(self, texture, mask): + + if isinstance(texture, torch.Tensor): + texture_np = texture.cpu().numpy() + elif isinstance(texture, np.ndarray): + texture_np = texture + elif isinstance(texture, Image.Image): + texture_np = np.array(texture) / 255.0 + + vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() + + texture_np, mask = meshVerticeInpaint( + texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + + texture_np = cv2.inpaint( + (texture_np * + 255).astype( + np.uint8), + 255 - + mask, + 3, + cv2.INPAINT_NS) + + return texture_np diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0ba1a6145c68651ec033b97e80900cd2c9d7ec --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py @@ -0,0 +1,44 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +import trimesh + + +def load_mesh(mesh): + vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None + pos_idx = mesh.faces if hasattr(mesh, 'faces') else None + + vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None + uv_idx = mesh.faces if hasattr(mesh, 'faces') else None + + texture_data = None + + return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data + + +def save_mesh(mesh, texture_data): + material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) + texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) + mesh.visual = texture_visuals + return mesh diff --git a/build/lib/hy3dgen/texgen/differentiable_renderer/setup.py b/build/lib/hy3dgen/texgen/differentiable_renderer/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea78693fe96ac027742bd752238421c6d83f8fc --- /dev/null +++ b/build/lib/hy3dgen/texgen/differentiable_renderer/setup.py @@ -0,0 +1,48 @@ +from setuptools import setup, Extension +import pybind11 +import sys +import platform + +def get_platform_specific_args(): + system = platform.system().lower() + cpp_std = 'c++14' # Make configurable if needed + + if sys.platform == 'win32': + compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] + link_args = [] + extra_includes = [] + elif system == 'linux': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] + link_args = ['-fPIC', '-pthread'] + extra_includes = [] + elif sys.platform == 'darwin': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', + '-stdlib=libc++', '-mmacosx-version-min=10.14'] + link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] + extra_includes = [] + else: + raise RuntimeError(f"Unsupported platform: {system}") + + return compile_args, link_args, extra_includes + +extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() +include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] +include_dirs.extend(platform_includes) + +ext_modules = [ + Extension( + "mesh_processor", + ["mesh_processor.cpp"], + include_dirs=include_dirs, + language='c++', + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ), +] + +setup( + name="mesh_processor", + ext_modules=ext_modules, + install_requires=['pybind11>=2.6.0'], + python_requires='>=3.6', +) \ No newline at end of file diff --git a/build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py b/build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. 
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py b/build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..436ce34efb8bc40c3df2b3902b7a29dffa39ae91 --- /dev/null +++ b/build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py @@ -0,0 +1,554 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
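+
+# --- Illustrative usage sketch (editor's addition, not part of the upstream file) ---
+# A minimal, hedged example of how HunyuanPaintPipeline.__call__ below is driven.
+# `pipe` is assumed to be an already-constructed HunyuanPaintPipeline; `ref_image`
+# is a PIL reference image and `normal_views` / `position_views` are lists of PIL
+# multiview renders (e.g. from MeshRender.render_normal / render_position).
+# Keyword names follow the __call__ signature and the cached_condition keys used
+# in this module; the concrete values shown are assumptions.
+#
+#     out = pipe(
+#         image=ref_image,                          # single RGB(A) reference image
+#         guidance_scale=2.0,
+#         num_inference_steps=28,
+#         width=512, height=512,
+#         num_in_batch=len(normal_views),           # views generated per batch
+#         normal_imgs=[normal_views],               # List[List[PIL.Image]]
+#         position_imgs=[position_views],
+#         camera_info_gen=[[0, 1, 2, 3, 4, 5]],     # per-view camera indices
+#         camera_info_ref=[[0]],
+#     )
+#     out.images[0].save("multiview_0.png")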
+ +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy +import numpy as np +import torch +import torch.distributed +import torch.utils.checkpoint +from PIL import Image +from diffusers import ( + AutoencoderKL, + DiffusionPipeline, + ImagePipelineOutput +) +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \ + rescale_noise_cfg +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import deprecate +from einops import rearrange +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from .unet.modules import UNet2p5DConditionModel + + +def to_rgb_image(maybe_rgba: Image.Image): + if maybe_rgba.mode == 'RGB': + return maybe_rgba + elif maybe_rgba.mode == 'RGBA': + rgba = maybe_rgba + img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) + img = Image.fromarray(img, 'RGB') + img.paste(rgba, mask=rgba.getchannel('A')) + return img + else: + raise ValueError("Unsupported image type.", maybe_rgba.mode) + + +class HunyuanPaintPipeline(StableDiffusionPipeline): + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2p5DConditionModel, + scheduler: KarrasDiffusionSchedulers, + feature_extractor: CLIPImageProcessor, + safety_checker=None, + use_torch_compile=False, + ): + DiffusionPipeline.__init__(self) + + safety_checker = None + self.register_modules( + vae=torch.compile(vae) if use_torch_compile else vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + @torch.no_grad() + def encode_images(self, images): + B = images.shape[0] + images = rearrange(images, 'b n c h w -> (b n) c h w') + + dtype = next(self.vae.parameters()).dtype + images = (images - 0.5) * 2.0 + posterior = self.vae.encode(images.to(dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) + return latents + + @torch.no_grad() + def __call__( + self, + image: Image.Image = None, + prompt=None, + negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', + *args, + num_images_per_prompt: Optional[int] = 1, + guidance_scale=2.0, + output_type: Optional[str] = "pil", + width=512, + height=512, + num_inference_steps=28, + return_dict=True, + **cached_condition, + ): + if image is None: + raise ValueError("Inputting embeddings not supported for this pipeline. 
Please pass an image.") + assert not isinstance(image, torch.Tensor) + + image = to_rgb_image(image) + + image_vae = torch.tensor(np.array(image) / 255.0) + image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) + image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype) + + batch_size = image_vae.shape[0] + assert batch_size == 1 + assert num_images_per_prompt == 1 + + ref_latents = self.encode_images(image_vae) + + def convert_pil_list_to_tensor(images): + bg_c = [1., 1., 1.] + images_tensor = [] + for batch_imgs in images: + view_imgs = [] + for pil_img in batch_imgs: + img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. + if img.shape[2] > 3: + alpha = img[:, :, 3:] + img = img[:, :, :3] * alpha + bg_c * (1 - alpha) + img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") + view_imgs.append(img) + view_imgs = torch.cat(view_imgs, dim=0) + images_tensor.append(view_imgs.unsqueeze(0)) + + images_tensor = torch.cat(images_tensor, dim=0) + return images_tensor + + if "normal_imgs" in cached_condition: + + if isinstance(cached_condition["normal_imgs"], List): + cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) + + cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) + + if "position_imgs" in cached_condition: + + if isinstance(cached_condition["position_imgs"], List): + cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) + + cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) + + if 'camera_info_gen' in cached_condition: + camera_info = cached_condition['camera_info_gen'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(image_vae.device).to(torch.int64) + cached_condition['camera_info_gen'] = camera_info + if 'camera_info_ref' in cached_condition: + camera_info = cached_condition['camera_info_ref'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(image_vae.device).to(torch.int64) + cached_condition['camera_info_ref'] = camera_info + + cached_condition['ref_latents'] = ref_latents + + if guidance_scale > 1: + negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) + cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) + cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) + if "normal_imgs" in cached_condition: + cached_condition['normal_imgs'] = torch.cat( + (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) + + if "position_imgs" in cached_condition: + cached_condition['position_imgs'] = torch.cat( + (cached_condition['position_imgs'], cached_condition['position_imgs'])) + + if 'position_maps' in cached_condition: + cached_condition['position_maps'] = torch.cat( + (cached_condition['position_maps'], cached_condition['position_maps'])) + + if 'camera_info_gen' in cached_condition: + cached_condition['camera_info_gen'] = torch.cat( + (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) + if 'camera_info_ref' in cached_condition: + cached_condition['camera_info_ref'] = torch.cat( + (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) + + prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) + negative_prompt_embeds = 
torch.zeros_like(prompt_embeds) + + latents: torch.Tensor = self.denoise( + None, + *args, + cross_attention_kwargs=None, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_inference_steps=num_inference_steps, + output_type='latent', + width=width, + height=height, + **cached_condition + ).images + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def denoise( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. 
If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. 
`callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + assert num_images_per_prompt == 1 + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * kwargs['num_in_batch'], # num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) + + # predict the noise residual + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, **kwargs + )[0] + latents = rearrange(latents, 'b n c h w -> (b n) c h w') + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = \ + self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, + return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..5d16bc6b6bb1ebc72c602dcb298d122429fe847d --- /dev/null +++ b/build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py @@ -0,0 +1,440 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +import copy +import json +import os +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from diffusers.models import UNet2DConditionModel +from diffusers.models.attention_processor import Attention +from diffusers.models.transformers.transformer_2d import BasicTransformerBlock +from einops import rearrange + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." 
+ ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +class Basic2p5DTransformerBlock(torch.nn.Module): + def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None: + super().__init__() + self.transformer = transformer + self.layer_name = layer_name + self.use_ma = use_ma + self.use_ra = use_ra + + # multiview attn + if self.use_ma: + self.attn_multiview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + # ref attn + if self.use_ra: + self.attn_refview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.transformer, name) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) + mode = cross_attention_kwargs.pop('mode', None) + mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) + ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) + condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) + + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. 
Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 Reference Attention + if 'w' in mode: + condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', + n=num_in_batch) # B, (N L), C + + if 'r' in mode and self.use_ra: + condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, + 1) # B N L C + condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') + + attn_output = self.attn_refview( + norm_hidden_states, + encoder_hidden_states=condition_embed, + attention_mask=None, + **cross_attention_kwargs + ) + ref_scale_timing = ref_scale + if isinstance(ref_scale, torch.Tensor): + ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) + for _ in range(attn_output.ndim - 1): + ref_scale_timing = ref_scale_timing.unsqueeze(-1) + hidden_states = ref_scale_timing * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.3 Multiview Attention + if num_in_batch > 1 and self.use_ma: + multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) + + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + **cross_attention_kwargs + ) + + attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) + + hidden_states = mva_scale * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # 4. 
Feed-forward + # i2vgen doesn't have this norm 🤷‍♂️ + if self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif not self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm3(hidden_states) + + if self.norm_type == "ada_norm_zero": + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.norm_type == "ada_norm_zero": + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.norm_type == "ada_norm_single": + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +class UNet2p5DConditionModel(torch.nn.Module): + def __init__(self, unet: UNet2DConditionModel) -> None: + super().__init__() + self.unet = unet + + self.use_ma = True + self.use_ra = True + self.use_camera_embedding = True + self.use_dual_stream = True + + if self.use_dual_stream: + self.unet_dual = copy.deepcopy(unet) + self.init_attention(self.unet_dual) + self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra) + self.init_condition() + self.init_camera_embedding() + + @staticmethod + def from_pretrained(pretrained_model_name_or_path, **kwargs): + torch_dtype = kwargs.pop('torch_dtype', torch.float32) + config_path = os.path.join(pretrained_model_name_or_path, 'config.json') + unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') + with open(config_path, 'r', encoding='utf-8') as file: + config = json.load(file) + unet = UNet2DConditionModel(**config) + unet = UNet2p5DConditionModel(unet) + unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) + unet.load_state_dict(unet_ckpt, strict=True) + unet = unet.to(torch_dtype) + return unet + + def init_condition(self): + self.unet.conv_in = torch.nn.Conv2d( + 12, + self.unet.conv_in.out_channels, + kernel_size=self.unet.conv_in.kernel_size, + stride=self.unet.conv_in.stride, + padding=self.unet.conv_in.padding, + dilation=self.unet.conv_in.dilation, + groups=self.unet.conv_in.groups, + bias=self.unet.conv_in.bias is not None) + + self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) + self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) + + def init_camera_embedding(self): + + if self.use_camera_embedding: + time_embed_dim = 1280 + self.max_num_ref_image = 5 + self.max_num_gen_image = 12 * 3 + 4 * 2 + self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) + + def init_attention(self, unet, use_ma=False, use_ra=False): + + for down_block_i, down_block in enumerate(unet.down_blocks): + if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: + for attn_i, attn in enumerate(down_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + 
f'down_{down_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra) + + if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: + for attn_i, attn in enumerate(unet.mid_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + f'mid_{attn_i}_{transformer_i}', + use_ma, use_ra) + + for up_block_i, up_block in enumerate(unet.up_blocks): + if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: + for attn_i, attn in enumerate(up_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + f'up_{up_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.unet, name) + + def forward( + self, sample, timestep, encoder_hidden_states, + *args, down_intrablock_additional_residuals=None, + down_block_res_samples=None, mid_block_res_sample=None, + **cached_condition, + ): + B, N_gen, _, H, W = sample.shape + assert H == W + + if self.use_camera_embedding: + camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image + camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') + else: + camera_info_gen = None + + sample = [sample] + if 'normal_imgs' in cached_condition: + sample.append(cached_condition["normal_imgs"]) + if 'position_imgs' in cached_condition: + sample.append(cached_condition["position_imgs"]) + sample = torch.cat(sample, dim=2) + + sample = rearrange(sample, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) + encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') + + if self.use_ra: + if 'condition_embed_dict' in cached_condition: + condition_embed_dict = cached_condition['condition_embed_dict'] + else: + condition_embed_dict = {} + ref_latents = cached_condition['ref_latents'] + N_ref = ref_latents.shape[1] + if self.use_camera_embedding: + camera_info_ref = cached_condition['camera_info_ref'] + camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') + else: + camera_info_ref = None + + ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) + encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') + + noisy_ref_latents = ref_latents + timestep_ref = 0 + + if self.use_dual_stream: + unet_ref = self.unet_dual + else: + unet_ref = self.unet + unet_ref( + noisy_ref_latents, timestep_ref, + encoder_hidden_states=encoder_hidden_states_ref, + class_labels=camera_info_ref, + # **kwargs + return_dict=False, + cross_attention_kwargs={ + 'mode': 'w', 'num_in_batch': N_ref, + 'condition_embed_dict': condition_embed_dict}, + ) + cached_condition['condition_embed_dict'] = condition_embed_dict + else: + condition_embed_dict = None + + mva_scale = cached_condition.get('mva_scale', 1.0) + ref_scale = cached_condition.get('ref_scale', 1.0) + + return self.unet( + sample, timestep, + encoder_hidden_states_gen, *args, + class_labels=camera_info_gen, + down_intrablock_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for 
sample in down_intrablock_additional_residuals + ] if down_intrablock_additional_residuals is not None else None, + down_block_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples + ] if down_block_res_samples is not None else None, + mid_block_additional_residual=( + mid_block_res_sample.to(dtype=self.unet.dtype) + if mid_block_res_sample is not None else None + ), + return_dict=False, + cross_attention_kwargs={ + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + }, + ) diff --git a/build/lib/hy3dgen/texgen/pipelines.py b/build/lib/hy3dgen/texgen/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..ebb95ea41ed0e39b3b48d1fe30ad22e4dd5f4a41 --- /dev/null +++ b/build/lib/hy3dgen/texgen/pipelines.py @@ -0,0 +1,227 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
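+
+# --- Illustrative usage sketch (editor's addition, not part of the upstream file) ---
+# A minimal, hedged example of the texture-generation entry point defined below.
+# The repo id 'tencent/Hunyuan3D-2', the file names and the export step are
+# assumptions; only Hunyuan3DPaintPipeline.from_pretrained(...) and
+# pipeline(mesh, image) are taken from this module.
+#
+#     import trimesh
+#     pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
+#     mesh = trimesh.load('untextured_shape.glb')
+#     textured_mesh = pipeline(mesh, image='demo_image.png')   # PIL image or path
+#     textured_mesh.export('textured_shape.glb')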
+ + +import logging +import os + +import numpy as np +import torch +from PIL import Image + +from .differentiable_renderer.mesh_render import MeshRender +from .utils.dehighlight_utils import Light_Shadow_Remover +from .utils.multiview_utils import Multiview_Diffusion_Net +from .utils.uv_warp_utils import mesh_uv_wrap + +logger = logging.getLogger(__name__) + + +class Hunyuan3DTexGenConfig: + + def __init__(self, light_remover_ckpt_path, multiview_ckpt_path): + self.device = 'cpu' + self.light_remover_ckpt_path = light_remover_ckpt_path + self.multiview_ckpt_path = multiview_ckpt_path + + self.candidate_camera_azims = [0, 90, 180, 270, 0, 180] + self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90] + self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05] + + self.render_size = 2048 + self.texture_size = 1024 + self.bake_exp = 4 + self.merge_method = 'fast' + + +class Hunyuan3DPaintPipeline: + @classmethod + def from_pretrained(cls, model_path): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path)) + + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0') + + if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path): + try: + import huggingface_hub + # download from huggingface + model_path = huggingface_hub.snapshot_download(repo_id=original_model_path) + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0') + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path)) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + else: + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path)) + + raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface") + + def __init__(self, config): + self.config = config + self.models = {} + self.render = MeshRender( + default_resolution=self.config.render_size, + texture_size=self.config.texture_size) + + self.load_models() + + def load_models(self): + # empty cuda cache + torch.cuda.empty_cache() + # Load models + self.models['delight_model'] = Light_Shadow_Remover(self.config) + self.models['multiview_model'] = Multiview_Diffusion_Net(self.config) + + def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True): + normal_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + normal_map = self.render.render_normal( + elev, azim, use_abs_coor=use_abs_coor, return_type='pl') + normal_maps.append(normal_map) + + return normal_maps + + def render_position_multiview(self, camera_elevs, camera_azims): + position_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + position_map = self.render.render_position( + elev, azim, return_type='pl') + position_maps.append(position_map) + + return position_maps + + def bake_from_multiview(self, views, camera_elevs, + camera_azims, view_weights, method='graphcut'): + project_textures, project_weighted_cos_maps = [], [] + project_boundary_maps = [] + for view, camera_elev, camera_azim, weight in zip( + views, camera_elevs, camera_azims, view_weights): + project_texture, project_cos_map, project_boundary_map = self.render.back_project( + view, camera_elev, camera_azim) + project_cos_map = weight * (project_cos_map ** self.config.bake_exp) + project_textures.append(project_texture) + project_weighted_cos_maps.append(project_cos_map) + project_boundary_maps.append(project_boundary_map) + + if method == 'fast': + texture, ori_trust_map = self.render.fast_bake_texture( + project_textures, project_weighted_cos_maps) + else: + raise ValueError(f'no method {method}') + return texture, ori_trust_map > 1E-8 + + def texture_inpaint(self, texture, mask): + + texture_np = self.render.uv_inpaint(texture, mask) + texture = torch.tensor(texture_np / 255).float().to(texture.device) + + return texture + + def recenter_image(self, image, border_ratio=0.2): + if image.mode == 'RGB': + return image + elif image.mode == 'L': + image = image.convert('RGB') + return image + + alpha_channel = np.array(image)[:, :, 3] + non_zero_indices = np.argwhere(alpha_channel > 0) + if non_zero_indices.size == 0: + raise ValueError("Image is fully transparent") + + min_row, min_col = non_zero_indices.min(axis=0) + max_row, max_col = non_zero_indices.max(axis=0) + + cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1)) + + width, height = cropped_image.size + border_width = int(width * border_ratio) + border_height = int(height * border_ratio) + + new_width = width + 2 * border_width + new_height = height + 2 * border_height + + square_size = max(new_width, new_height) + + new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0)) + + paste_x = (square_size - new_width) // 2 + border_width + paste_y = (square_size - new_height) // 2 + border_height + + new_image.paste(cropped_image, (paste_x, paste_y)) + return new_image + + @torch.no_grad() + def __call__(self, mesh, image): + + if isinstance(image, str): + image_prompt = Image.open(image) + else: + image_prompt = image + + 
image_prompt = self.recenter_image(image_prompt) + + image_prompt = self.models['delight_model'](image_prompt) + + mesh = mesh_uv_wrap(mesh) + + self.render.load_mesh(mesh) + + selected_camera_elevs, selected_camera_azims, selected_view_weights = \ + self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights + + normal_maps = self.render_normal_multiview( + selected_camera_elevs, selected_camera_azims, use_abs_coor=True) + position_maps = self.render_position_multiview( + selected_camera_elevs, selected_camera_azims) + + camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[ + elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in + zip(selected_camera_azims, selected_camera_elevs)] + multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info) + + for i in range(len(multiviews)): + multiviews[i] = multiviews[i].resize( + (self.config.render_size, self.config.render_size)) + + texture, mask = self.bake_from_multiview(multiviews, + selected_camera_elevs, selected_camera_azims, selected_view_weights, + method=self.config.merge_method) + + mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8) + + texture = self.texture_inpaint(texture, mask_np) + + self.render.set_texture(texture) + textured_mesh = self.render.save_mesh() + + return textured_mesh diff --git a/build/lib/hy3dgen/texgen/utils/__init__.py b/build/lib/hy3dgen/texgen/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
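For orientation, a minimal usage sketch of the `Hunyuan3DPaintPipeline` defined above follows. It is illustrative only and not part of the diff: the repository id, mesh path, image path, and output name are placeholders, and it assumes the pipeline returns a `trimesh.Trimesh` that can be exported directly.

```
# Illustrative sketch; the repo id and file paths are placeholders, not defined in this repo.
import trimesh

from hy3dgen.texgen.pipelines import Hunyuan3DPaintPipeline

# Download (or load) the delight + multiview checkpoints and build the pipeline.
pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')  # assumed repo id

# Load an untextured mesh and pick an image prompt.
mesh = trimesh.load('untextured_mesh.glb', force='mesh')      # placeholder path
textured_mesh = pipeline(mesh, image='reference_image.png')   # placeholder path

# Assuming a trimesh.Trimesh is returned, export the textured result.
textured_mesh.export('textured_mesh.glb')
```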
diff --git a/build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0a09c17cfe1a3f1ac850688e96b66341f0226418 --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py @@ -0,0 +1,132 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ + +import torch +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \ + AutoencoderKL + + +class Img2img_Control_Ip_adapter: + def __init__(self, device): + controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16, + variant="fp16", use_safetensors=True) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + 'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True + ) + pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors") + pipe.set_ip_adapter_scale(0.7) + + pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) + # pipe.enable_model_cpu_offload() + self.pipe = pipe.to(device) + + def __call__( + self, + prompt, + control_image, + ip_adapter_image, + negative_prompt, + height=512, + width=512, + num_inference_steps=20, + guidance_scale=8.0, + controlnet_conditioning_scale=1.0, + output_type="pil", + **kwargs, + ): + results = self.pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=control_image, + ip_adapter_image=ip_adapter_image, + generator=torch.manual_seed(42), + seed=42, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + controlnet_conditioning_scale=controlnet_conditioning_scale, + strength=1, + # clip_skip=2, + height=height, + width=width, + output_type=output_type, + **kwargs, + ).images[0] + return results + + +################################################################ + +class HesModel: + def __init__(self, ): + controlnet_depth = ControlNetModel.from_pretrained( + 'diffusers/controlnet-depth-sdxl-1.0', + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True + ) + self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( + 'stabilityai/stable-diffusion-xl-base-1.0', + torch_dtype=torch.float16, + variant="fp16", + controlnet=controlnet_depth, + use_safetensors=True, + ) + self.pipe.vae = AutoencoderKL.from_pretrained( + 'madebyollin/sdxl-vae-fp16-fix', + torch_dtype=torch.float16 + ) + + self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors") + self.pipe.set_ip_adapter_scale(0.7) + self.pipe.to("cuda") + + def __call__(self, + init_image, + control_image, + ip_adapter_image=None, + prompt='3D image', + negative_prompt='2D image', + seed=42, + strength=0.8, + num_inference_steps=40, + guidance_scale=7.5, + controlnet_conditioning_scale=0.5, + **kwargs + ): + image = self.pipe( + prompt=prompt, + image=init_image, + control_image=control_image, + ip_adapter_image=ip_adapter_image, + negative_prompt=negative_prompt, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + strength=strength, + controlnet_conditioning_scale=controlnet_conditioning_scale, + seed=seed, + **kwargs + ).images[0] + return image diff --git a/build/lib/hy3dgen/texgen/utils/counter_utils.py b/build/lib/hy3dgen/texgen/utils/counter_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e0374fc327ad2127ec84bb0c267c19a3b9c8d738 --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/counter_utils.py @@ -0,0 +1,58 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# 
("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +class RunningStats(): + def __init__(self) -> None: + self.count = 0 + self.sum = 0 + self.mean = 0 + self.min = None + self.max = None + + def add_value(self, value): + self.count += 1 + self.sum += value + self.mean = self.sum / self.count + + if self.min is None or value < self.min: + self.min = value + + if self.max is None or value > self.max: + self.max = value + + def get_count(self): + return self.count + + def get_sum(self): + return self.sum + + def get_mean(self): + return self.mean + + def get_min(self): + return self.min + + def get_max(self): + return self.max diff --git a/build/lib/hy3dgen/texgen/utils/dehighlight_utils.py b/build/lib/hy3dgen/texgen/utils/dehighlight_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..089076b08f712ec0db882835f422183fd7f94457 --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/dehighlight_utils.py @@ -0,0 +1,84 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler + + +class Light_Shadow_Remover(): + def __init__(self, config): + self.device = config.device + self.cfg_image = 1.5 + self.cfg_text = 1.0 + + pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( + config.light_remover_ckpt_path, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) + pipeline.set_progress_bar_config(disable=True) + + # self.pipeline = pipeline.to(self.device, torch.float16) + self.pipeline = pipeline # Needed to avoid displaying the warning + @torch.no_grad() + def __call__(self, image): + + image = image.resize((512, 512)) + + if image.mode == 'RGBA': + image_array = np.array(image) + alpha_channel = image_array[:, :, 3] + erosion_size = 3 + kernel = np.ones((erosion_size, erosion_size), np.uint8) + alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1) + image_array[alpha_channel == 0, :3] = 255 + image_array[:, :, 3] = alpha_channel + image = Image.fromarray(image_array) + + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = image_tensor[:, :, 3:] + rgb_target = image_tensor[:, :, :3] + else: + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = torch.ones_like(image_tensor)[:, :, :1] + rgb_target = image_tensor[:, :, :3] + + image = image.convert('RGB') + + image = self.pipeline( + prompt="", + image=image, + generator=torch.manual_seed(42), + height=512, + width=512, + num_inference_steps=50, + image_guidance_scale=self.cfg_image, + guidance_scale=self.cfg_text, + ).images[0] + + return image diff --git a/build/lib/hy3dgen/texgen/utils/multiview_utils.py b/build/lib/hy3dgen/texgen/utils/multiview_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba5708b617e0d58d6d37025fcb94a75324b9e5a9 --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/multiview_utils.py @@ -0,0 +1,86 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import random + +import numpy as np +import torch +from diffusers import DiffusionPipeline +from diffusers import EulerAncestralDiscreteScheduler + + +class Multiview_Diffusion_Net(): + def __init__(self, config) -> None: + self.device = config.device + self.view_size = 512 + multiview_ckpt_path = config.multiview_ckpt_path + + current_file_path = os.path.abspath(__file__) + custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint') + + pipeline = DiffusionPipeline.from_pretrained( + multiview_ckpt_path, + custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16) + + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + + pipeline.set_progress_bar_config(disable=True) + self.pipeline = pipeline #.to(self.device) # only for cosmetics and not display the warning + + def seed_everything(self, seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + def __call__(self, input_image, control_images, camera_info): + + self.seed_everything(0) + + input_image = input_image.resize((self.view_size, self.view_size)) + for i in range(len(control_images)): + control_images[i] = control_images[i].resize((self.view_size, self.view_size)) + if control_images[i].mode == 'L': + control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1') + + kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0)) + + num_view = len(control_images) // 2 + normal_image = [[control_images[i] for i in range(num_view)]] + position_image = [[control_images[i + num_view] for i in range(num_view)]] + + camera_info_gen = [camera_info] + camera_info_ref = [[0]] + kwargs['width'] = self.view_size + kwargs['height'] = self.view_size + kwargs['num_in_batch'] = num_view + kwargs['camera_info_gen'] = camera_info_gen + kwargs['camera_info_ref'] = camera_info_ref + kwargs["normal_imgs"] = normal_image + kwargs["position_imgs"] = position_image + + mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images + return mvd_image diff --git a/build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py b/build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..915284d337e648c57fae886dee3333c0203856b6 --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py @@ -0,0 +1,46 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh + + +def remesh_mesh(mesh_path, remesh_path, method='trimesh'): + if method == 'trimesh': + mesh_simplify_trimesh(mesh_path, remesh_path) + else: + raise NotImplementedError(f'Method {method} has not been implemented.') + + +def mesh_simplify_trimesh(inputpath, outputpath): + import pymeshlab + ms = pymeshlab.MeshSet() + ms.load_new_mesh(inputpath, load_in_a_single_layer=True) + ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False) + + current = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh') + face_num = current.faces.shape[0] + + if face_num > 100000: + current = current.simplify_quadric_decimation(40000) + current.export(outputpath) diff --git a/build/lib/hy3dgen/texgen/utils/uv_warp_utils.py b/build/lib/hy3dgen/texgen/utils/uv_warp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b4f4082274b900aebcdbfcf29a7d6a9532dfa8cb --- /dev/null +++ b/build/lib/hy3dgen/texgen/utils/uv_warp_utils.py @@ -0,0 +1,42 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh +import xatlas + + +def mesh_uv_wrap(mesh): + if isinstance(mesh, trimesh.Scene): + mesh = mesh.dump(concatenate=True) + + # if len(mesh.faces) > 50000: + # raise ValueError("The mesh has more than 50,000 faces, which is not supported.") + + vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces) + + mesh.vertices = mesh.vertices[vmapping] + mesh.faces = indices + mesh.visual.uv = uvs + + return mesh diff --git a/build/lib/hy3dgen/text2image.py b/build/lib/hy3dgen/text2image.py new file mode 100644 index 0000000000000000000000000000000000000000..be920672cb72238cbe49cba930e3e02a7c287b82 --- /dev/null +++ b/build/lib/hy3dgen/text2image.py @@ -0,0 +1,93 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ + +import os +import random + +import numpy as np +import torch +from diffusers import AutoPipelineForText2Image + + +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + +class HunyuanDiTPipeline: + def __init__( + self, + model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled", + device='cpu' + ): + torch.set_default_device('cpu') + self.device = device + self.pipe = AutoPipelineForText2Image.from_pretrained( + model_path, + torch_dtype=torch.float16, + enable_pag=True, + pag_applied_layers=["blocks.(16|17|18|19)"] + ) # .to(device) # needed to avoid displaying the warning + self.pos_txt = ",白色背景,3D风格,最佳质量" + self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \ + "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \ + "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \ + "额外的手臂,额外的腿,融合的手指,手指太多,长脖子" + + def compile(self): + # accelarate hunyuan-dit transformer,first inference will cost long time + torch.set_float32_matmul_precision('high') + self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True) + # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True) + generator = torch.Generator(device=self.pipe.device) # infer once for hot-start + out_img = self.pipe( + prompt='美少女战士', + negative_prompt='模糊', + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + + @torch.no_grad() + def __call__(self, prompt, seed=0): + seed_everything(seed) + generator = torch.Generator(device="cuda") #self.pipe.device + generator = generator.manual_seed(int(seed)) + out_img = self.pipe( + prompt=self.pos_txt+prompt, + negative_prompt=self.neg_txt, + num_inference_steps=20, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + return out_img diff --git a/dist/hy3dgen-2.0.0-py3.12.egg b/dist/hy3dgen-2.0.0-py3.12.egg new file mode 100644 index 0000000000000000000000000000000000000000..31ccfc3573626346a5da66f2afb8405d256c18db Binary files /dev/null and b/dist/hy3dgen-2.0.0-py3.12.egg differ diff --git a/hy3dgen.egg-info/PKG-INFO b/hy3dgen.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..072b650de9e413b42adedbd8c4f9ce6e4df77b2a --- /dev/null +++ b/hy3dgen.egg-info/PKG-INFO @@ -0,0 +1,3 @@ +Metadata-Version: 2.2 +Name: hy3dgen +Version: 2.0.0 diff --git a/hy3dgen.egg-info/SOURCES.txt b/hy3dgen.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..5315f30372b3939c016975706a397c13051e4977 --- /dev/null +++ b/hy3dgen.egg-info/SOURCES.txt @@ -0,0 +1,37 @@ +README.md +setup.py +hy3dgen/__init__.py +hy3dgen/rembg.py +hy3dgen/text2image.py +hy3dgen.egg-info/PKG-INFO +hy3dgen.egg-info/SOURCES.txt +hy3dgen.egg-info/dependency_links.txt +hy3dgen.egg-info/top_level.txt +hy3dgen/shapegen/__init__.py +hy3dgen/shapegen/pipelines.py +hy3dgen/shapegen/postprocessors.py +hy3dgen/shapegen/preprocessors.py +hy3dgen/shapegen/schedulers.py +hy3dgen/shapegen/models/__init__.py +hy3dgen/shapegen/models/conditioner.py +hy3dgen/shapegen/models/hunyuan3ddit.py +hy3dgen/shapegen/models/vae.py +hy3dgen/texgen/__init__.py +hy3dgen/texgen/pipelines.py +hy3dgen/texgen/differentiable_renderer/__init__.py +hy3dgen/texgen/differentiable_renderer/camera_utils.py +hy3dgen/texgen/differentiable_renderer/mesh_processor.py +hy3dgen/texgen/differentiable_renderer/mesh_render.py 
+hy3dgen/texgen/differentiable_renderer/mesh_utils.py +hy3dgen/texgen/differentiable_renderer/setup.py +hy3dgen/texgen/hunyuanpaint/__init__.py +hy3dgen/texgen/hunyuanpaint/pipeline.py +hy3dgen/texgen/hunyuanpaint/unet/__init__.py +hy3dgen/texgen/hunyuanpaint/unet/modules.py +hy3dgen/texgen/utils/__init__.py +hy3dgen/texgen/utils/alignImg4Tex_utils.py +hy3dgen/texgen/utils/counter_utils.py +hy3dgen/texgen/utils/dehighlight_utils.py +hy3dgen/texgen/utils/multiview_utils.py +hy3dgen/texgen/utils/simplify_mesh_utils.py +hy3dgen/texgen/utils/uv_warp_utils.py \ No newline at end of file diff --git a/hy3dgen.egg-info/dependency_links.txt b/hy3dgen.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hy3dgen.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/hy3dgen.egg-info/top_level.txt b/hy3dgen.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d056144c5932e8fef7a7738a2306943b27bfa60 --- /dev/null +++ b/hy3dgen.egg-info/top_level.txt @@ -0,0 +1 @@ +hy3dgen diff --git a/hy3dgen/__init__.py b/hy3dgen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/rembg.py b/hy3dgen/rembg.py new file mode 100644 index 0000000000000000000000000000000000000000..c0d99483c8354fc10c6689b5cf12ebcd44368d92 --- /dev/null +++ b/hy3dgen/rembg.py @@ -0,0 +1,36 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from PIL import Image +from rembg import remove, new_session + + +class BackgroundRemover(): + def __init__(self): + self.session = new_session() + + def __call__(self, image: Image.Image): + output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0]) + return output diff --git a/hy3dgen/shapegen/__init__.py b/hy3dgen/shapegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d1f9534c15d029511d910d29e45da5ba7b8c8714 --- /dev/null +++ b/hy3dgen/shapegen/__init__.py @@ -0,0 +1,27 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline +from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover +from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR diff --git a/hy3dgen/shapegen/models/__init__.py b/hy3dgen/shapegen/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..684b3e389737fb988f5e363e777c34f6cd1fe4ea --- /dev/null +++ b/hy3dgen/shapegen/models/__init__.py @@ -0,0 +1,28 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder +from .hunyuan3ddit import Hunyuan3DDiT +from .vae import ShapeVAE diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py new file mode 100644 index 0000000000000000000000000000000000000000..1af4c0cc440a193167c0837621c3494242b95f3d --- /dev/null +++ b/hy3dgen/shapegen/models/conditioner.py @@ -0,0 +1,165 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import torch +import torch.nn as nn +from torchvision import transforms +from transformers import ( + CLIPVisionModelWithProjection, + CLIPVisionConfig, + Dinov2Model, + Dinov2Config, +) + + +class ImageEncoder(nn.Module): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + **kwargs, + ): + super().__init__() + + if config is None: + self.model = self.MODEL_CLASS.from_pretrained(version) + else: + self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config)) + self.model.eval() + self.model.requires_grad_(False) + self.use_cls_token = use_cls_token + self.size = image_size // 14 + self.num_patches = (image_size // 14) ** 2 + if self.use_cls_token: + self.num_patches += 1 + + self.transform = transforms.Compose( + [ + transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True), + transforms.CenterCrop(image_size), + transforms.Normalize( + mean=self.mean, + std=self.std, + ), + ] + ) + + def forward(self, image, mask=None, value_range=(-1, 1)): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + if not self.use_cls_token: + last_hidden_state = last_hidden_state[:, 1:, :] + + return last_hidden_state + + def unconditional_embedding(self, batch_size): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches, + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + + return zero + + +class CLIPImageEncoder(ImageEncoder): + MODEL_CLASS = CLIPVisionModelWithProjection + MODEL_CONFIG_CLASS = CLIPVisionConfig + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + + +class DinoImageEncoder(ImageEncoder): + MODEL_CLASS = Dinov2Model + MODEL_CONFIG_CLASS = Dinov2Config + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + + +def build_image_encoder(config): + if config['type'] == 'CLIPImageEncoder': + return CLIPImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoder': + return DinoImageEncoder(**config['kwargs']) + else: + raise ValueError(f'Unknown image encoder type: {config["type"]}') + + +class DualImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + additional_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + self.additional_image_encoder = build_image_encoder(additional_image_encoder) + + def forward(self, image, mask=None): + outputs = { + 'main': self.main_image_encoder(image, mask=mask), + 'additional': self.additional_image_encoder(image, mask=mask), + } + return outputs + + def 
unconditional_embedding(self, batch_size): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size), + 'additional': self.additional_image_encoder.unconditional_embedding(batch_size), + } + return outputs + + +class SingleImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + + def forward(self, image, mask=None): + outputs = { + 'main': self.main_image_encoder(image, mask=mask), + } + return outputs + + def unconditional_embedding(self, batch_size): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size), + } + return outputs diff --git a/hy3dgen/shapegen/models/hunyuan3ddit.py b/hy3dgen/shapegen/models/hunyuan3ddit.py new file mode 100644 index 0000000000000000000000000000000000000000..d1c778666890cb13538eba15460cf0c05c7f9130 --- /dev/null +++ b/hy3dgen/shapegen/models/hunyuan3ddit.py @@ -0,0 +1,390 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +from dataclasses import dataclass +from typing import List, Tuple, Optional + +import torch +from einops import rearrange +from torch import Tensor, nn + + +def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor: + x = torch.nn.functional.scaled_dot_product_attention(q, k, v) + x = rearrange(x, "B H L D -> B L (H D)") + return x + + +def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. 
+ """ + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + t.device + ) + + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(t) + return embedding + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms).to(dtype=x_dtype) * self.scale + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +class SelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = nn.Linear(dim, dim) + + def forward(self, x: Tensor, pe: Tensor) -> Tensor: + qkv = self.qkv(x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int, double: bool): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) + + def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: + out = self.lin(nn.functional.silu(vec))[:, None, :] + out = out.chunk(self.multiplier, dim=-1) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) + + +class DoubleStreamBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_mod = Modulation(hidden_size, double=True) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = 
SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + nn.GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] + + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + return img, txt + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: Optional[float] = None, + ): + super().__init__() + + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = nn.GELU(approximate="tanh") + self.modulation = Modulation(hidden_size, double=False) + + def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + mod, _ = self.modulation(vec) + + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + return x + mod.gate * output + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +class Hunyuan3DDiT(nn.Module): + def __init__( + self, + in_channels: int = 64, + context_in_dim: int = 1536, + hidden_size: int = 1024, + mlp_ratio: float = 4.0, + num_heads: int = 16, + depth: int = 16, + depth_single_blocks: int = 32, + axes_dim: List[int] = [64], + theta: int = 10_000, + qkv_bias: bool = True, + time_factor: float = 1000, + ckpt_path: Optional[str] = None, + **kwargs, + ): + super().__init__() + self.in_channels = in_channels + self.context_in_dim = context_in_dim + self.hidden_size = hidden_size + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.depth = depth + self.depth_single_blocks = depth_single_blocks + self.axes_dim = axes_dim + self.theta = theta + self.qkv_bias = qkv_bias + self.time_factor = time_factor + self.out_channels = self.in_channels + + if hidden_size % num_heads != 0: + raise ValueError( + f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" + ) + pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = hidden_size + self.num_heads = num_heads + self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.cond_in = nn.Linear(context_in_dim, self.hidden_size) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + ) + for _ in range(depth) + ] 
+ ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + ) + for _ in range(depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + if ckpt_path is not None: + print('restored denoiser ckpt', ckpt_path) + + ckpt = torch.load(ckpt_path, map_location="cpu") + if 'state_dict' not in ckpt: + # deepspeed ckpt + state_dict = {} + for k in ckpt.keys(): + new_k = k.replace('_forward_module.', '') + state_dict[new_k] = ckpt[k] + else: + state_dict = ckpt["state_dict"] + + final_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('model.'): + final_state_dict[k.replace('model.', '')] = v + else: + final_state_dict[k] = v + missing, unexpected = self.load_state_dict(final_state_dict, strict=False) + print('unexpected keys:', unexpected) + print('missing keys:', missing) + + def forward( + self, + x, + t, + contexts, + **kwargs, + ) -> Tensor: + cond = contexts['main'] + latent = self.latent_in(x) + vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype)) + cond = self.cond_in(cond) + pe = None + + for block in self.double_blocks: + latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe) + + latent = torch.cat((cond, latent), 1) + for block in self.single_blocks: + latent = block(latent, vec=vec, pe=pe) + + latent = latent[:, cond.shape[1]:, ...] + latent = self.final_layer(latent, vec) + return latent diff --git a/hy3dgen/shapegen/models/vae.py b/hy3dgen/shapegen/models/vae.py new file mode 100644 index 0000000000000000000000000000000000000000..aef2784ac0db653714e711d12697eafc962c2aa3 --- /dev/null +++ b/hy3dgen/shapegen/models/vae.py @@ -0,0 +1,636 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
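Before the VAE definition below, a minimal smoke test of the `Hunyuan3DDiT` denoiser above may help when reviewing the block wiring. This is a hedged sketch, not part of the diff: the import path is assumed (the file header for the DiT module sits outside this hunk), the hyperparameters are deliberately tiny, and the weights are random, so only the output shape is checked.

```
import torch

# Assumed import path for the module that defines Hunyuan3DDiT above.
from hy3dgen.shapegen.models.hunyuan3ddit import Hunyuan3DDiT

# Tiny illustrative config: sum(axes_dim) must equal hidden_size // num_heads,
# which the constructor validates.
model = Hunyuan3DDiT(
    in_channels=16,
    context_in_dim=32,
    hidden_size=128,
    num_heads=4,
    depth=2,
    depth_single_blocks=2,
    axes_dim=[32],
).eval()

B, n_latents, n_cond_tokens = 2, 256, 257
x = torch.randn(B, n_latents, 16)                       # noisy shape latents
t = torch.rand(B)                                       # timesteps in [0, 1]
contexts = {"main": torch.randn(B, n_cond_tokens, 32)}  # forward() reads the 'main' key

with torch.no_grad():
    out = model(x, t, contexts)

print(out.shape)  # torch.Size([2, 256, 16]); conditioning tokens are stripped before the final layer
```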
+ +from typing import Tuple, List, Union, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from skimage import measure +from tqdm import tqdm + + +class FourierEmbedder(nn.Module): + """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts + each feature dimension of `x[..., i]` into: + [ + sin(x[..., i]), + sin(f_1*x[..., i]), + sin(f_2*x[..., i]), + ... + sin(f_N * x[..., i]), + cos(x[..., i]), + cos(f_1*x[..., i]), + cos(f_2*x[..., i]), + ... + cos(f_N * x[..., i]), + x[..., i] # only present if include_input is True. + ], here f_i is the frequency. + + Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs]. + If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...]; + Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]. + + Args: + num_freqs (int): the number of frequencies, default is 6; + logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]; + input_dim (int): the input dimension, default is 3; + include_input (bool): include the input tensor or not, default is True. + + Attributes: + frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1); + + out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1), + otherwise, it is input_dim * num_freqs * 2. + + """ + + def __init__(self, + num_freqs: int = 6, + logspace: bool = True, + input_dim: int = 3, + include_input: bool = True, + include_pi: bool = True) -> None: + + """The initialization""" + + super().__init__() + + if logspace: + frequencies = 2.0 ** torch.arange( + num_freqs, + dtype=torch.float32 + ) + else: + frequencies = torch.linspace( + 1.0, + 2.0 ** (num_freqs - 1), + num_freqs, + dtype=torch.float32 + ) + + if include_pi: + frequencies *= torch.pi + + self.register_buffer("frequencies", frequencies, persistent=False) + self.include_input = include_input + self.num_freqs = num_freqs + + self.out_dim = self.get_dims(input_dim) + + def get_dims(self, input_dim): + temp = 1 if self.include_input or self.num_freqs == 0 else 0 + out_dim = input_dim * (self.num_freqs * 2 + temp) + + return out_dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Forward process. + + Args: + x: tensor of shape [..., dim] + + Returns: + embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)] + where temp is 1 if include_input is True and 0 otherwise. + """ + + if self.num_freqs > 0: + embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1) + if self.include_input: + return torch.cat((x, embed.sin(), embed.cos()), dim=-1) + else: + return torch.cat((embed.sin(), embed.cos()), dim=-1) + else: + return x + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if self.drop_prob == 0. or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob, 3):0.3f}' + + +class MLP(nn.Module): + def __init__( + self, *, + width: int, + output_width: int = None, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.width = width + self.c_fc = nn.Linear(width, width * 4) + self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width) + self.gelu = nn.GELU() + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x): + return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) + + +class QKVMultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_data: Optional[int] = None, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_data = n_data + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, q, kv): + _, n_ctx, _ = q.shape + bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 + q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) + k, v = torch.split(kv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + + return out + + +class MultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + width: int, + heads: int, + qkv_bias: bool = True, + n_data: Optional[int] = None, + data_width: Optional[int] = None, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + self.n_data = n_data + self.width = width + self.heads = heads + self.data_width = width if data_width is None else data_width + self.c_q = nn.Linear(width, width, bias=qkv_bias) + self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadCrossAttention( + heads=heads, + n_data=n_data, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + + def forward(self, x, data): + x = self.c_q(x) + data = self.c_kv(data) + x = self.attention(x, data) + x = self.c_proj(x) + return x + + +class ResidualCrossAttentionBlock(nn.Module): + def __init__( + self, + *, + n_data: Optional[int] = None, + width: int, + heads: int, + data_width: Optional[int] = None, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + + if data_width is None: + data_width = width + + self.attn = MultiheadCrossAttention( + 
n_data=n_data, + width=width, + heads=heads, + data_width=data_width, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6) + self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width) + + def forward(self, x: torch.Tensor, data: torch.Tensor): + x = x + self.attn(self.ln_1(x), self.ln_2(data)) + x = x + self.mlp(self.ln_3(x)) + return x + + +class QKVMultiheadAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_ctx: int, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_ctx = n_ctx + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, qkv): + bs, n_ctx, width = qkv.shape + attn_ch = width // self.heads // 3 + qkv = qkv.view(bs, n_ctx, self.heads, -1) + q, k, v = torch.split(qkv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadAttention(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.heads = heads + self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadAttention( + heads=heads, + n_ctx=n_ctx, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward(self, x): + x = self.c_qkv(x) + x = self.attention(x) + x = self.drop_path(self.c_proj(x)) + return x + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.attn = MultiheadAttention( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, drop_path_rate=drop_path_rate) + self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6) + + def forward(self, x: torch.Tensor): + x = x + self.attn(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + layers: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList( + [ + ResidualAttentionBlock( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + for _ in range(layers) + ] + ) + + def forward(self, x: torch.Tensor): + for block in self.resblocks: + x = block(x) + return x + + +class CrossAttentionDecoder(nn.Module): + + def __init__( + self, + *, + num_latents: int, + out_channels: int, + fourier_embedder: FourierEmbedder, + width: int, + heads: int, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary" + ): + super().__init__() + + self.fourier_embedder = fourier_embedder + + self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width) + + self.cross_attn_decoder = ResidualCrossAttentionBlock( + n_data=num_latents, + width=width, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm + ) + + self.ln_post = nn.LayerNorm(width) + self.output_proj = nn.Linear(width, out_channels) + self.label_type = label_type + + def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor): + queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype)) + x = self.cross_attn_decoder(queries, latents) + x = self.ln_post(x) + occ = self.output_proj(x) + return occ + + +def generate_dense_grid_points(bbox_min: np.ndarray, + bbox_max: np.ndarray, + octree_depth: int, + indexing: str = "ij", + octree_resolution: int = None, + ): + length = bbox_max - bbox_min + num_cells = np.exp2(octree_depth) + if octree_resolution is not None: + num_cells = octree_resolution + + x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32) + y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) + z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) + [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) + xyz = np.stack((xs, ys, zs), axis=-1) + xyz = xyz.reshape(-1, 3) + grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + + return xyz, grid_size, length + + +def center_vertices(vertices): + """Translate the vertices so that bounding box is centered at zero.""" + vert_min = vertices.min(dim=0)[0] + vert_max = vertices.max(dim=0)[0] + vert_center = 0.5 * (vert_min + vert_max) + return vertices - vert_center + + +class Latent2MeshOutput: 
+ + def __init__(self, mesh_v=None, mesh_f=None): + self.mesh_v = mesh_v + self.mesh_f = mesh_f + + +class ShapeVAE(nn.Module): + def __init__( + self, + *, + num_latents: int, + embed_dim: int, + width: int, + heads: int, + num_decoder_layers: int, + num_freqs: int = 8, + include_pi: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary", + drop_path_rate: float = 0.0, + scale_factor: float = 1.0, + ): + super().__init__() + self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + + self.post_kl = nn.Linear(embed_dim, width) + + self.transformer = Transformer( + n_ctx=num_latents, + width=width, + layers=num_decoder_layers, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + + self.geo_decoder = CrossAttentionDecoder( + fourier_embedder=self.fourier_embedder, + out_channels=1, + num_latents=num_latents, + width=width, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + label_type=label_type, + ) + + self.scale_factor = scale_factor + self.latent_shape = (num_latents, embed_dim) + + def forward(self, latents): + latents = self.post_kl(latents) + latents = self.transformer(latents) + return latents + + @torch.no_grad() + def latents2mesh( + self, + latents: torch.FloatTensor, + bounds: Union[Tuple[float], List[float], float] = 1.1, + octree_depth: int = 7, + num_chunks: int = 10000, + mc_level: float = -1 / 512, + octree_resolution: int = None, + mc_algo: str = 'dmc', + ): + device = latents.device + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_depth=octree_depth, + octree_resolution=octree_resolution, + indexing="ij" + ) + xyz_samples = torch.FloatTensor(xyz_samples) + + # 2. latents to 3d volume + batch_logits = [] + batch_size = latents.shape[0] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), + desc=f"MC Level {mc_level} Implicit Function:"): + queries = xyz_samples[start: start + num_chunks, :].to(device) + queries = queries.half() + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + + logits = self.geo_decoder(batch_queries.to(latents.dtype), latents) + if mc_level == -1: + mc_level = 0 + logits = torch.sigmoid(logits) * 2 - 1 + print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.') + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=1) + grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float() + + # 3. 
extract surface + outputs = [] + for i in range(batch_size): + try: + if mc_algo == 'mc': + vertices, faces, normals, _ = measure.marching_cubes( + grid_logits[i].cpu().numpy(), + mc_level, + method="lewiner" + ) + vertices = vertices / grid_size * bbox_size + bbox_min + elif mc_algo == 'dmc': + if not hasattr(self, 'dmc'): + try: + from diso import DiffDMC + except: + raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'") + self.dmc = DiffDMC(dtype=torch.float32).to(device) + octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution + sdf = -grid_logits[i] / octree_resolution + verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True) + verts = center_vertices(verts) + vertices = verts.detach().cpu().numpy() + faces = faces.detach().cpu().numpy()[:, ::-1] + else: + raise ValueError(f"mc_algo {mc_algo} not supported.") + + outputs.append( + Latent2MeshOutput( + mesh_v=vertices.astype(np.float32), + mesh_f=np.ascontiguousarray(faces) + ) + ) + + except ValueError: + outputs.append(None) + except RuntimeError: + outputs.append(None) + + return outputs diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..e4f10a49c4c153121a9b581d20f9e36a0f168499 --- /dev/null +++ b/hy3dgen/shapegen/pipelines.py @@ -0,0 +1,589 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
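For the `ShapeVAE` decoder above, the rough data flow is latents -> `post_kl` -> `Transformer` -> `latents2mesh`, where the last step queries the `CrossAttentionDecoder` on a dense grid and extracts an isosurface. A hedged sketch with an untrained, down-sized model follows; the hyperparameters are illustrative only, real checkpoints are much larger, and with random weights the extracted meshes may legitimately come back as `None`.

```
import torch

# Assumed import path, matching the vae.py file added in this diff.
from hy3dgen.shapegen.models.vae import FourierEmbedder, ShapeVAE

# FourierEmbedder maps 3-D query points to sin/cos features plus the raw input.
embedder = FourierEmbedder(num_freqs=8, include_pi=True)
points = torch.rand(4, 3)
print(embedder(points).shape, embedder.out_dim)  # torch.Size([4, 51]) 51 == 3 * (8 * 2 + 1)

# Down-sized decoder with random weights; only the shapes are meaningful here.
vae = ShapeVAE(
    num_latents=64,
    embed_dim=8,
    width=128,
    heads=4,
    num_decoder_layers=2,
).eval()

latents = torch.randn(1, 64, 8)
with torch.no_grad():
    decoded = vae(latents)            # post_kl + transformer
    meshes = vae.latents2mesh(
        decoded,
        octree_resolution=32,         # coarse grid keeps this sketch cheap
        mc_algo="mc",                 # skimage marching cubes; 'dmc' needs the optional `diso` package
        num_chunks=8000,
    )

print(decoded.shape)                  # torch.Size([1, 64, 128])
print(type(meshes[0]))                # Latent2MeshOutput, or None if no isosurface is found
```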
+ +import copy +import importlib +import inspect +import logging +import os +from typing import List, Optional, Union + +import numpy as np +import torch +import trimesh +import yaml +from PIL import Image +from diffusers.utils.torch_utils import randn_tensor +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +def export_to_trimesh(mesh_output): + if isinstance(mesh_output, list): + outputs = [] + for mesh in mesh_output: + if mesh is None: + outputs.append(None) + else: + mesh.mesh_f = mesh.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) + outputs.append(mesh_output) + return outputs + else: + mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) + return mesh_output + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def instantiate_from_config(config, **kwargs): + if "target" not in config: + raise KeyError("Expected key `target` to instantiate.") + cls = get_obj_from_str(config["target"]) + params = config.get("params", dict()) + kwargs.update(params) + instance = cls(**kwargs) + return instance + + +class Hunyuan3DDiTPipeline: + @classmethod + def from_single_file( + cls, + ckpt_path, + config_path, + device='cpu', + dtype=torch.float16, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + logger.info(f"Loading model from {ckpt_path}") + + if ckpt_path.endswith('.safetensors'): + # parse safetensors + import safetensors.torch + safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + ckpt = {} + for key, value in safetensors_ckpt.items(): + model_name = key.split('.')[0] + new_key = key[len(model_name) + 1:] + if model_name not in ckpt: + ckpt[model_name] = {} + ckpt[model_name][new_key] = value + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + + # load model + from accelerate import init_empty_weights + with init_empty_weights(): + model = instantiate_from_config(config['model']) + vae = instantiate_from_config(config['vae']) + conditioner = instantiate_from_config(config['conditioner']) + image_processor = instantiate_from_config(config['image_processor']) + scheduler = instantiate_from_config(config['scheduler']) + + model.load_state_dict(ckpt['model'], assign = True) + vae.load_state_dict(ckpt['vae'], assign = True) + if 'conditioner' in ckpt: + conditioner.load_state_dict(ckpt['conditioner'], assign = True) + + model_kwargs = dict( + vae=vae, + model=model, + scheduler=scheduler, + conditioner=conditioner, + image_processor=image_processor, + device=device, + dtype=dtype, + ) + model_kwargs.update(kwargs) + + return cls( + **model_kwargs + ) + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + variant=None, + subfolder='hunyuan3d-dit-v2-0', + **kwargs, + ): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) + if not os.path.exists(model_path): + try: + import huggingface_hub + # download from huggingface + path = 
huggingface_hub.snapshot_download(repo_id=original_model_path) + model_path = os.path.join(path, subfolder) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." + ) + raise RuntimeError(f"Model path {model_path} not found") + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model path {original_model_path} not found") + + extension = 'ckpt' if not use_safetensors else 'safetensors' + variant = '' if variant is None else f'.{variant}' + ckpt_name = f'model{variant}.{extension}' + config_path = os.path.join(model_path, 'config.yaml') + ckpt_path = os.path.join(model_path, ckpt_name) + + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + variant=variant, + **kwargs + ) + + def __init__( + self, + vae, + model, + scheduler, + conditioner, + image_processor, + device='cuda', + dtype=torch.float16, + **kwargs + ): + self.vae = vae + self.model = model + self.scheduler = scheduler + self.conditioner = conditioner + self.image_processor = image_processor + + self.to(device, dtype) + + def to(self, device=None, dtype=None): + if device is not None: + self.device = torch.device(device) + self.vae.to(device) + self.model.to(device) + self.conditioner.to(device) + if dtype is not None: + self.dtype = dtype + self.vae.to(dtype=dtype) + self.model.to(dtype=dtype) + self.conditioner.to(dtype=dtype) + + def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance): + bsz = image.shape[0] + cond = self.conditioner(image=image, mask=mask) + + if do_classifier_free_guidance: + un_cond = self.conditioner.unconditional_embedding(bsz) + + if dual_guidance: + un_cond_drop_main = copy.deepcopy(un_cond) + un_cond_drop_main['additional'] = cond['additional'] + + def cat_recursive(a, b, c): + if isinstance(a, torch.Tensor): + return torch.cat([a, b, c], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k], c[k]) + return out + + cond = cat_recursive(cond, un_cond_drop_main, un_cond) + else: + un_cond = self.conditioner.unconditional_embedding(bsz) + + def cat_recursive(a, b): + if isinstance(a, torch.Tensor): + return torch.cat([a, b], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k]) + return out + + cond = cat_recursive(cond, un_cond) + return cond + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def prepare_latents(self, batch_size, dtype, device, generator, latents=None): + shape = (batch_size, *self.vae.latent_shape) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) + return latents + + def prepare_image(self, image): + if isinstance(image, str) and not os.path.exists(image): + raise FileNotFoundError(f"Couldn't find image at path {image}") + + if not isinstance(image, list): + image = [image] + image_pts = [] + mask_pts = [] + for img in image: + image_pt, mask_pt = self.image_processor(img, return_mask=True) + image_pts.append(image_pt) + mask_pts.append(mask_pt) + + image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype) + if mask_pts[0] is not None: + mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype) + else: + mask_pts = None + return image_pts, mask_pts + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + dual_guidance_scale: float = 10.5, + dual_guidance: bool = True, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=-1 / 512, + num_chunks=8000, + mc_algo='mc', + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and \ + getattr(self.model, 'guidance_cond_proj_dim', None) is None + dual_guidance = dual_guidance_scale >= 0 and dual_guidance + + image, mask = self.prepare_image(image) + cond = self.encode_cond(image=image, + mask=mask, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=dual_guidance) + batch_size = image.shape[0] + + t_dtype = torch.long + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas) + + latents = self.prepare_latents(batch_size, dtype, device, generator) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + guidance_cond = None + if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: + print('Using lcm guidance scale') + guidance_scale_tensor = torch.tensor(guidance_scale - 
1).repeat(batch_size) + guidance_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) + else: + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) + timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) + noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) + + # no drop, drop clip, all drop + if do_classifier_free_guidance: + if dual_guidance: + noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_clip - noise_pred_dino) + + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) + ) + else: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) + + def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo): + if not output_type == "latent": + latents = 1. / self.vae.scale_factor * latents + latents = self.vae(latents) + outputs = self.vae.latents2mesh( + latents, + bounds=box_v, + mc_level=mc_level, + num_chunks=num_chunks, + octree_resolution=octree_resolution, + mc_algo=mc_algo, + ) + else: + outputs = latents + + if output_type == 'trimesh': + outputs = export_to_trimesh(outputs) + + return outputs + + +class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=0.0, + mc_algo='mc', + num_chunks=8000, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and not ( + hasattr(self.model, 'guidance_embed') and + self.model.guidance_embed is True + ) + + image, mask = self.prepare_image(image) + cond = self.encode_cond( + image=image, + mask=mask, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=False, + ) + batch_size = image.shape[0] + + # 5. Prepare timesteps + # NOTE: this is slightly different from common usage, we start from 0. 
+ sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + ) + latents = self.prepare_latents(batch_size, dtype, device, generator) + + guidance = None + if hasattr(self.model, 'guidance_embed') and \ + self.model.guidance_embed is True: + guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) + + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * 2) + else: + latent_model_input = latents + + # NOTE: we assume model get timesteps ranged from 0 to 1 + timestep = t.expand(latent_model_input.shape[0]).to( + latents.dtype) / self.scheduler.config.num_train_timesteps + noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) + + if do_classifier_free_guidance: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) diff --git a/hy3dgen/shapegen/postprocessors.py b/hy3dgen/shapegen/postprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..0500fa2d8f70a3a933f8313d11126ad9b27bf57c --- /dev/null +++ b/hy3dgen/shapegen/postprocessors.py @@ -0,0 +1,175 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
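The two pipelines above differ mainly in their schedules: `Hunyuan3DDiTPipeline.__call__` follows the usual diffusion loop (optionally with dual guidance), while `Hunyuan3DDiTFlowMatchingPipeline` samples with sigmas running from 0 to 1 and divides the timestep by `num_train_timesteps` before calling the model. Below is a hedged end-to-end sketch of the flow-matching variant; the repo id, the checkpoint filename that the `from_pretrained` defaults resolve to, and the package-level import are assumptions, and the input can be any local RGB/RGBA image.

```
import torch

# Assumed package-level export; the shapegen __init__ is not part of this hunk.
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    "tencent/Hunyuan3D-2",     # assumed repo id; subfolder defaults to 'hunyuan3d-dit-v2-0'
    device=device,
    dtype=dtype,
)

meshes = pipeline(
    image="assets/shoes.png",  # any local image path or PIL.Image; an alpha channel is used as the mask
    num_inference_steps=30,
    guidance_scale=7.5,
    octree_resolution=256,
    output_type="trimesh",
)
meshes[0].export("shoes.glb")
```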
+ +import os +import tempfile +from typing import Union + +import pymeshlab +import trimesh + +from .models.vae import Latent2MeshOutput + + +def load_mesh(path): + if path.endswith(".glb"): + mesh = trimesh.load(path) + else: + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(path) + return mesh + + +def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000): + mesh.apply_filter( + "meshing_decimation_quadric_edge_collapse", + targetfacenum=max_facenum, + qualitythr=1.0, + preserveboundary=True, + boundaryweight=3, + preservenormal=True, + preservetopology=True, + autoclean=True + ) + return mesh + + +def remove_floater(mesh: pymeshlab.MeshSet): + mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face", + nbfaceratio=0.005) + mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False) + mesh.apply_filter("meshing_remove_selected_vertices_and_faces") + return mesh + + +def pymeshlab2trimesh(mesh: pymeshlab.MeshSet): + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + mesh.save_current_mesh(temp_file_name) + mesh = trimesh.load(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + # 检查加载的对象类型 + if isinstance(mesh, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + # 如果是Scene,遍历所有的geometry并合并 + for geom in mesh.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + mesh = combined_mesh + return mesh + + +def trimesh2pymeshlab(mesh: trimesh.Trimesh): + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + if isinstance(mesh, trimesh.scene.Scene): + for idx, obj in enumerate(mesh.geometry.values()): + if idx == 0: + temp_mesh = obj + else: + temp_mesh = temp_mesh + obj + mesh = temp_mesh + mesh.export(temp_file_name) + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + return mesh + + +def export_mesh(input, output): + if isinstance(input, pymeshlab.MeshSet): + mesh = output + elif isinstance(input, Latent2MeshOutput): + output = Latent2MeshOutput() + output.mesh_v = output.current_mesh().vertex_matrix() + output.mesh_f = output.current_mesh().face_matrix() + mesh = output + else: + mesh = pymeshlab2trimesh(output) + return mesh + + +def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet: + if isinstance(mesh, str): + mesh = load_mesh(mesh) + elif isinstance(mesh, Latent2MeshOutput): + mesh = pymeshlab.MeshSet() + mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f) + mesh.add_mesh(mesh_pymeshlab, "converted_mesh") + + if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)): + mesh = trimesh2pymeshlab(mesh) + + return mesh + + +class FaceReducer: + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + max_facenum: int = 40000 + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]: + ms = import_mesh(mesh) + ms = reduce_face(ms, max_facenum=max_facenum) + mesh = export_mesh(mesh, ms) + return mesh + + +class FloaterRemover: + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + ms = remove_floater(ms) + mesh = export_mesh(mesh, ms) + return mesh + + +class DegenerateFaceRemover: + def 
__call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + + temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True) + temp_file.close() + temp_file_name = temp_file.name + + ms.save_current_mesh(temp_file_name) + ms = pymeshlab.MeshSet() + ms.load_new_mesh(temp_file_name) + if os.path.exists(temp_file_name): + os.remove(temp_file_name) + + mesh = export_mesh(mesh, ms) + return mesh diff --git a/hy3dgen/shapegen/preprocessors.py b/hy3dgen/shapegen/preprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..2bdaff2d16cc0844d8d23c886d35c2f4e7286ff7 --- /dev/null +++ b/hy3dgen/shapegen/preprocessors.py @@ -0,0 +1,127 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from einops import repeat, rearrange + + +def array_to_tensor(np_array): + image_pt = torch.tensor(np_array).float() + image_pt = image_pt / 255 * 2 - 1 + image_pt = rearrange(image_pt, "h w c -> c h w") + image_pts = repeat(image_pt, "c h w -> b c h w", b=1) + return image_pts + + +class ImageProcessorV2: + def __init__(self, size=512, border_ratio=None): + self.size = size + self.border_ratio = border_ratio + + @staticmethod + def recenter(image, border_ratio: float = 0.2): + """ recenter an image to leave some empty space at the image border. + + Args: + image (ndarray): input image, float/uint8 [H, W, 3/4] + mask (ndarray): alpha mask, bool [H, W] + border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. 
+ + Returns: + ndarray: output image, float/uint8 [H, W, 3/4] + """ + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = np.ones_like(image[..., 0:1]) * 255 + image = np.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = np.zeros((size, size, C), dtype=np.uint8) + + coords = np.nonzero(mask) + x_min, x_max = coords[0].min(), coords[0].max() + y_min, y_max = coords[1].min(), coords[1].max() + h = x_max - x_min + w = y_max - y_min + if h == 0 or w == 0: + raise ValueError('input image is empty') + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + h2 = int(h * scale) + w2 = int(w * scale) + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), + interpolation=cv2.INTER_AREA) + + bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + mask = result[..., 3:].astype(np.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).astype(np.uint8) + mask = mask.clip(0, 255).astype(np.uint8) + return result, mask + + def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + print(f"Using border_ratio from init: {border_ratio}") + if isinstance(image, str): + image = cv2.imread(image, cv2.IMREAD_UNCHANGED) + image, mask = self.recenter(image, border_ratio=border_ratio) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif isinstance(image, Image.Image): + image = np.asarray(image) + image, mask = self.recenter(image, border_ratio=border_ratio) + + image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) + mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) + mask = mask[..., np.newaxis] + + if to_tensor: + image = array_to_tensor(image) + mask = array_to_tensor(mask) + if return_mask: + return image, mask + return image + + +IMAGE_PROCESSORS = { + "v2": ImageProcessorV2, +} + +DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/hy3dgen/shapegen/schedulers.py b/hy3dgen/shapegen/schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..0069f5cd49c5095930b588f01129a77f172171a7 --- /dev/null +++ b/hy3dgen/shapegen/schedulers.py @@ -0,0 +1,307 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
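`ImageProcessorV2` above recenters the foreground (using the alpha channel when present), composites it onto a white background, resizes to `size`, and rescales pixel values to roughly [-1, 1]. A hedged, self-contained sketch on a synthetic RGBA image (the import path is assumed from the diff header above):

```
import numpy as np
from PIL import Image

# Assumed import path, matching the preprocessors file added in this diff.
from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)

# Synthetic RGBA input: a black square on a transparent background.
rgba = np.zeros((300, 400, 4), dtype=np.uint8)  # black image, fully transparent
rgba[100:200, 150:250, 3] = 255                 # opaque square acts as the "object"
image = Image.fromarray(rgba, mode="RGBA")

image_pt, mask_pt = processor(image, border_ratio=0.15, return_mask=True)
print(image_pt.shape, mask_pt.shape)  # torch.Size([1, 3, 512, 512]) torch.Size([1, 1, 512, 512])
print(image_pt.min().item(), image_pt.max().item())  # -1.0 1.0 (black foreground, white composite background)
```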
+ +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed + + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co./papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. 
+ timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.config.num_train_timesteps + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.config.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/hy3dgen/texgen/__init__.py b/hy3dgen/texgen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1f890f024d507021eca8087d40dc472de36152bd --- /dev/null +++ b/hy3dgen/texgen/__init__.py @@ -0,0 +1,26 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..df40dcc8d4819eb903263ff1faf70ce902eb7e07 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py @@ -0,0 +1,32 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +''' +from .hierarchy import BuildHierarchy, BuildHierarchyWithColor +from .io_obj import LoadObj, LoadObjWithTexture +from .render import rasterize, interpolate +''' +from .io_glb import * +from .io_obj import * +from .render import * diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d7dc8c6127e62848dda8e79fdc281c5a7b42cb --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py @@ -0,0 +1,248 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +import base64 +import io +import os + +import numpy as np +from PIL import Image as PILImage +from pygltflib import GLTF2 +from scipy.spatial.transform import Rotation as R + + +# Function to extract buffer data +def get_buffer_data(gltf, buffer_view): + buffer = gltf.buffers[buffer_view.buffer] + buffer_data = gltf.get_data_from_buffer_uri(buffer.uri) + byte_offset = buffer_view.byteOffset if buffer_view.byteOffset else 0 + byte_length = buffer_view.byteLength + return buffer_data[byte_offset:byte_offset + byte_length] + + +# Function to extract attribute data +def get_attribute_data(gltf, accessor_index): + accessor = gltf.accessors[accessor_index] + buffer_view = gltf.bufferViews[accessor.bufferView] + buffer_data = get_buffer_data(gltf, buffer_view) + + comptype = {5120: np.int8, 5121: np.uint8, 5122: np.int16, 5123: np.uint16, 5125: np.uint32, 5126: np.float32} + dtype = comptype[accessor.componentType] + + t2n = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT2': 4, 'MAT3': 9, 'MAT4': 16} + num_components = t2n[accessor.type] + + # Calculate the correct slice of data + byte_offset = accessor.byteOffset if accessor.byteOffset else 0 + byte_stride = buffer_view.byteStride if buffer_view.byteStride else num_components * np.dtype(dtype).itemsize + count = accessor.count + + # Extract the attribute data + attribute_data = np.zeros((count, num_components), dtype=dtype) + for i in range(count): + start = byte_offset + i * byte_stride + end = start + num_components * np.dtype(dtype).itemsize + attribute_data[i] = np.frombuffer(buffer_data[start:end], dtype=dtype) + + return attribute_data + + +# Function to extract image data +def get_image_data(gltf, image, folder): + if image.uri: + if image.uri.startswith('data:'): + # Data URI + header, encoded = image.uri.split(',', 1) + data = base64.b64decode(encoded) + else: + # External file + fn = image.uri + if not os.path.isabs(fn): + fn = folder + '/' + fn + with open(fn, 'rb') as f: + data = f.read() + else: + buffer_view = gltf.bufferViews[image.bufferView] + data = get_buffer_data(gltf, buffer_view) + return data + + +# Function to convert triangle strip to triangles +def convert_triangle_strip_to_triangles(indices): + triangles = [] + for i in range(len(indices) - 2): + if i % 2 == 0: + triangles.append([indices[i], indices[i + 1], indices[i + 2]]) + else: + triangles.append([indices[i], indices[i + 2], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to convert triangle fan to triangles +def convert_triangle_fan_to_triangles(indices): + triangles = [] + for i in range(1, len(indices) - 1): + triangles.append([indices[0], indices[i], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to get the transformation matrix from a node +def get_node_transform(node): + if node.matrix: + return np.array(node.matrix).reshape(4, 4).T + else: + T = np.eye(4) + if node.translation: + T[:3, 3] = node.translation + if node.rotation: + R_mat = R.from_quat(node.rotation).as_matrix() + T[:3, :3] = R_mat + if node.scale: + S = np.diag(node.scale + [1]) + T = T @ S + return T + + +def get_world_transform(gltf, node_index, parents, world_transforms): + if parents[node_index] == -2: + return world_transforms[node_index] + + node = gltf.nodes[node_index] + if parents[node_index] == -1: + world_transforms[node_index] = get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + parent_index = parents[node_index] + parent_transform = get_world_transform(gltf, parent_index, 
parents, world_transforms) + world_transforms[node_index] = parent_transform @ get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + +def LoadGlb(path): + # Load the GLB file using pygltflib + gltf = GLTF2().load(path) + + primitives = [] + images = {} + # Iterate through the meshes in the GLB file + + world_transforms = [np.identity(4) for i in range(len(gltf.nodes))] + parents = [-1 for i in range(len(gltf.nodes))] + for node_index, node in enumerate(gltf.nodes): + for idx in node.children: + parents[idx] = node_index + # for i in range(len(gltf.nodes)): + # get_world_transform(gltf, i, parents, world_transform) + + for node_index, node in enumerate(gltf.nodes): + if node.mesh is not None: + world_transform = get_world_transform(gltf, node_index, parents, world_transforms) + # Iterate through the primitives in the mesh + mesh = gltf.meshes[node.mesh] + for primitive in mesh.primitives: + # Access the attributes of the primitive + attributes = primitive.attributes.__dict__ + mode = primitive.mode if primitive.mode is not None else 4 # Default to TRIANGLES + result = {} + if primitive.indices is not None: + indices = get_attribute_data(gltf, primitive.indices) + if mode == 4: # TRIANGLES + face_indices = indices.reshape(-1, 3) + elif mode == 5: # TRIANGLE_STRIP + face_indices = convert_triangle_strip_to_triangles(indices) + elif mode == 6: # TRIANGLE_FAN + face_indices = convert_triangle_fan_to_triangles(indices) + else: + continue + result['F'] = face_indices + + # Extract vertex positions + if 'POSITION' in attributes and attributes['POSITION'] is not None: + positions = get_attribute_data(gltf, attributes['POSITION']) + # Apply the world transformation to the positions + positions_homogeneous = np.hstack([positions, np.ones((positions.shape[0], 1))]) + transformed_positions = (world_transform @ positions_homogeneous.T).T[:, :3] + result['V'] = transformed_positions + + # Extract vertex colors + if 'COLOR_0' in attributes and attributes['COLOR_0'] is not None: + colors = get_attribute_data(gltf, attributes['COLOR_0']) + if colors.shape[-1] > 3: + colors = colors[..., :3] + result['VC'] = colors + + # Extract UVs + if 'TEXCOORD_0' in attributes and not attributes['TEXCOORD_0'] is None: + uvs = get_attribute_data(gltf, attributes['TEXCOORD_0']) + result['UV'] = uvs + + if primitive.material is not None: + material = gltf.materials[primitive.material] + if material.pbrMetallicRoughness is not None and material.pbrMetallicRoughness.baseColorTexture is not None: + texture_index = material.pbrMetallicRoughness.baseColorTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + elif material.emissiveTexture is not None: + texture_index = material.emissiveTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + else: + if material.pbrMetallicRoughness is not None: + 
base_color = material.pbrMetallicRoughness.baseColorFactor + else: + base_color = np.array([0.8, 0.8, 0.8], dtype=np.float32) + result['MC'] = base_color + + primitives.append(result) + + return primitives, images + + +def RotatePrimitives(primitives, transform): + for i in range(len(primitives)): + if 'V' in primitives[i]: + primitives[i]['V'] = primitives[i]['V'] @ transform.T + + +if __name__ == '__main__': + path = 'data/test.glb' + LoadGlb(path) diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py new file mode 100644 index 0000000000000000000000000000000000000000..a72c478d8efcb9a3d71a67ce5f167559ef76b922 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py @@ -0,0 +1,76 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +import cv2 +import numpy as np + + +def LoadObj(fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + faces = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, 4)] + vertices.append(v) + elif words[0] == 'f': + f = [int(words[i]) - 1 for i in range(1, 4)] + faces.append(f) + + return np.array(vertices).astype('float32'), np.array(faces).astype('int32') + + +def LoadObjWithTexture(fn, tex_fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + vertex_textures = [] + faces = [] + face_textures = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, len(words))] + vertices.append(v) + elif words[0] == 'vt': + v = [float(words[i]) for i in range(1, len(words))] + vertex_textures.append(v) + elif words[0] == 'f': + f = [] + ft = [] + for i in range(1, len(words)): + t = words[i].split('/') + f.append(int(t[0]) - 1) + ft.append(int(t[1]) - 1) + for i in range(2, len(f)): + faces.append([f[0], f[i - 1], f[i]]) + face_textures.append([ft[0], ft[i - 1], ft[i]]) + + tex_image = cv2.cvtColor(cv2.imread(tex_fn), cv2.COLOR_BGR2RGB) + return np.array(vertices).astype('float32'), np.array(vertex_textures).astype('float32'), np.array(faces).astype( + 'int32'), np.array(face_textures).astype('int32'), tex_image diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py new file mode 100644 index 0000000000000000000000000000000000000000..743d4aac4da9e1e18374ce712ac24d19e6788870 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py @@ -0,0 +1,41 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +import custom_rasterizer_kernel +import torch + + +def rasterize(pos, tri, resolution, clamp_depth=torch.zeros(0), use_depth_prior=0): + assert (pos.device == tri.device) + findices, barycentric = custom_rasterizer_kernel.rasterize_image(pos[0], tri, clamp_depth, resolution[1], + resolution[0], 1e-6, use_depth_prior) + return findices, barycentric + + +def interpolate(col, findices, barycentric, tri): + f = findices - 1 + (findices == 0) + vcol = col[0, tri.long()[f.long()]] + result = barycentric.view(*barycentric.shape, 1) * vcol + result = torch.sum(result, axis=-2) + return result.view(1, *result.shape) diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
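The `rasterize` / `interpolate` pair defined in render.py above is the Python-facing API of the custom rasterizer: `rasterize` returns a per-pixel face-index map plus barycentric weights, and `interpolate` uses those weights to turn per-vertex attributes into an image. The snippet below is a minimal usage sketch, not part of the repository: it assumes the `custom_rasterizer_kernel` extension has already been built from the accompanying setup.py, that vertex positions are already in homogeneous clip space with shape `(1, V, 4)`, and that the toy triangle, colors, and resolution are illustrative placeholders (a CPU code path is used when the tensors are not on CUDA).

```
# Minimal sketch (assumes the custom_rasterizer_kernel extension is installed;
# the triangle, colors and resolution below are illustrative placeholders).
import torch
from custom_rasterizer import rasterize, interpolate

device = "cuda" if torch.cuda.is_available() else "cpu"

# One triangle in homogeneous clip space: positions (1, V, 4), faces (F, 3) int32.
pos = torch.tensor([[[-0.5, -0.5, 0.0, 1.0],
                     [ 0.5, -0.5, 0.0, 1.0],
                     [ 0.0,  0.5, 0.0, 1.0]]], dtype=torch.float32, device=device)
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32, device=device)

# Per-vertex attributes to interpolate (here RGB colors), shape (1, V, C).
col = torch.tensor([[[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0]]], dtype=torch.float32, device=device)

H, W = 256, 256
findices, barycentric = rasterize(pos, tri, (H, W))   # (H, W) face ids, (H, W, 3) weights
image = interpolate(col, findices, barycentric, tri)  # (1, H, W, C) interpolated attributes

mask = findices > 0  # face ids are 1-based; 0 marks uncovered pixels
print(image.shape, mask.float().mean().item())
```

The `(height, width)` ordering of `resolution` follows the call inside `rasterize`, which forwards `resolution[1]` as the width and `resolution[0]` as the height to the kernel.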
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dab3983eef9cae227710bcdc4d86fc2e50b4e6be --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp @@ -0,0 +1,575 @@ +#include "rasterizer.h" +#include + +inline int pos2key(float* p, int resolution) { + int x = (p[0] * 0.5 + 0.5) * resolution; + int y = (p[1] * 0.5 + 0.5) * resolution; + int z = (p[2] * 0.5 + 0.5) * resolution; + return (x * resolution + y) * resolution + z; +} + +inline void key2pos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.5) / resolution - 0.5) * 2; + p[1] = ((y + 0.5) / resolution - 0.5) * 2; + p[2] = ((z + 0.5) / resolution - 0.5) * 2; +} + +inline void key2cornerpos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.75) / resolution - 0.5) * 2; + p[1] = ((y + 0.25) / resolution - 0.5) * 2; + p[2] = ((z + 0.75) / resolution - 0.5) * 2; +} + +inline float* pos_ptr(int l, int i, int j, torch::Tensor t) { + float* pdata = t.data_ptr(); + int height = t.size(1); + int width = t.size(2); + return &pdata[((l * height + i) * width + j) * 4]; +} + +struct Grid +{ + std::vector seq2oddcorner; + std::vector seq2evencorner; + std::vector seq2grid; + std::vector seq2normal; + std::vector seq2neighbor; + std::unordered_map grid2seq; + std::vector downsample_seq; + int num_origin_seq; + int resolution; + int stride; +}; + +inline void pos_from_seq(Grid& grid, int seq, float* p) { + auto k = grid.seq2grid[seq]; + key2pos(k, grid.resolution, p); +} + +inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) { + float* p = pos_ptr(l, i, j, pdata); + if (p[3] == 0) + return -1; + auto key = pos2key(p, grid.resolution); + int seq = grid.grid2seq[key]; + return seq; +} + +inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) { + int num_layers = pdata.size(0); + int l = 0; + int idx = fetch_seq(grid, l, i, j, pdata); + while (l < num_layers - 1) { + l += 1; + int new_idx = fetch_seq(grid, l, i, j, pdata); + if (new_idx == -1) + break; + idx = new_idx; + } + return idx; +} + +inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_idx = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_idx = idx; + } + } + return best_idx; +} + +inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_layer = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_layer = l; + } + } + return best_layer; +} + +void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions, + int* 
output_indices) +{ + auto t = view_layer_positions[dim]; + int height = t.size(1); + int width = t.size(2); + int top = 0; + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + int stride = grid.stride; + for (int ni = ci + stride; ni >= ci - stride; ni -= stride) { + for (int nj = cj - stride; nj <= cj + stride; nj += stride) { + int idx = -1; + if (ni == ci && nj == cj) + idx = seq; + else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) { + if (boundary_info == -1) + idx = fetch_seq(grid, 0, ni, nj, t); + else if (boundary_info == 1) + idx = fetch_last_seq(grid, ni, nj, t); + else + idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t); + } + output_indices[top] = idx; + top += 1; + } + } +} + +void DownsampleGrid(Grid& src, Grid& tar) +{ + src.downsample_seq.resize(src.seq2grid.size(), -1); + tar.resolution = src.resolution / 2; + tar.stride = src.stride * 2; + float pos[3]; + std::vector seq2normal_count; + for (int i = 0; i < src.seq2grid.size(); ++i) { + key2pos(src.seq2grid[i], src.resolution, pos); + int k = pos2key(pos, tar.resolution); + int s = seq2normal_count.size(); + if (!tar.grid2seq.count(k)) { + tar.grid2seq[k] = tar.seq2grid.size(); + tar.seq2grid.emplace_back(k); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + //tar.seq2normal.emplace_back(src.seq2normal[i]); + } else { + s = tar.grid2seq[k] * 3; + } + seq2normal_count[s + src.seq2normal[i]] += 1; + src.downsample_seq[i] = tar.grid2seq[k]; + } + tar.seq2normal.resize(seq2normal_count.size() / 3); + for (int i = 0; i < seq2normal_count.size(); i += 3) { + int t = 0; + for (int j = 1; j < 3; ++j) { + if (seq2normal_count[i + j] > seq2normal_count[i + t]) + t = j; + } + tar.seq2normal[i / 3] = t; + } +} + +void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v) +{ + grid.seq2evencorner.resize(grid.seq2grid.size(), 0); + grid.seq2oddcorner.resize(grid.seq2grid.size(), 0); + std::unordered_set visited_seq; + for (int vd = 0; vd < 3; ++vd) { + auto t = view_layer_positions[vd]; + auto t0 = view_layer_positions[v]; + int height = t.size(1); + int width = t.size(2); + int num_layers = t.size(0); + int num_view_layers = t0.size(0); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + for (int l = 0; l < num_layers; ++l) { + int seq = fetch_seq(grid, l, i, j, t); + if (seq == -1) + break; + int dim = grid.seq2normal[seq]; + if (dim != v) + continue; + + float pos[3]; + pos_from_seq(grid, seq, pos); + + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + + if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride)) + grid.seq2evencorner[seq] = 1; + + if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride)) + grid.seq2oddcorner[seq] = 1; + + bool is_boundary = false; + if (vd == v) { + if (l == 0 || l == num_layers - 1) + is_boundary = true; + else { + int seq_new = fetch_seq(grid, l + 1, i, j, t); + if (seq_new == -1) + is_boundary = true; + } + } + int boundary_info = 0; + if (is_boundary && (l == 0)) + boundary_info = -1; + else if 
(is_boundary) + boundary_info = 1; + if (visited_seq.count(seq)) + continue; + visited_seq.insert(seq); + + FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]); + } + } + } + } +} + +void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) { + auto& downsample_seq = src.downsample_seq; + auto& seq2evencorner = src.seq2evencorner; + auto& seq2oddcorner = src.seq2oddcorner; + int indices[9]; + std::vector mapped_even_corners(tar.seq2grid.size(), 0); + std::vector mapped_odd_corners(tar.seq2grid.size(), 0); + for (int i = 0; i < downsample_seq.size(); ++i) { + if (seq2evencorner[i] > 0) { + mapped_even_corners[downsample_seq[i]] = 1; + } + if (seq2oddcorner[i] > 0) { + mapped_odd_corners[downsample_seq[i]] = 1; + } + } + auto& tar_seq2normal = tar.seq2normal; + auto& tar_seq2grid = tar.seq2grid; + for (int i = 0; i < tar_seq2grid.size(); ++i) { + if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1) + continue; + auto k = tar_seq2grid[i]; + float p[3]; + key2cornerpos(k, tar.resolution, p); + + int src_key = pos2key(p, src.resolution); + if (!src.grid2seq.count(src_key)) { + int seq = src.seq2grid.size(); + src.grid2seq[src_key] = seq; + src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0)); + src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0)); + src.seq2grid.emplace_back(src_key); + src.seq2normal.emplace_back(tar_seq2normal[i]); + FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices); + for (int j = 0; j < 9; ++j) { + src.seq2neighbor.emplace_back(indices[j]); + } + src.downsample_seq.emplace_back(i); + } else { + int seq = src.grid2seq[src_key]; + if (mapped_even_corners[i] == 0) + src.seq2evencorner[seq] = 1; + if (mapped_odd_corners[i] == 0) + src.seq2oddcorner[seq] = 1; + } + } +} + +std::vector> build_hierarchy(std::vector view_layer_positions, + std::vector view_layer_normals, int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! 
(%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + } + + std::vector texture_positions(2); + std::vector grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = 
grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + + } + return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + std::vector seq2feat; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + int feat_channel = 3; + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + float* data_feat = view_layer_feats[v].data_ptr(); + feat_channel = view_layer_feats[v].size(3); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + float* f = &data_feat[(i * width + j) * feat_channel]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(f[c]); + } + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + data_feat += (height * width * feat_channel); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(0.5); + } + } + + std::vector texture_positions(2); + std::vector texture_feats(1); + std::vector 
grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + float* feats_out_ptr = texture_feats[0].data_ptr(); + memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size()); + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + } + return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4529d7eb674d5263f5103f7a2c2aa5085ee752d5 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp @@ -0,0 +1,139 @@ +#include "rasterizer.h" + +void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5f, py + 0.5f}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + zbuffer[pixel] = (INT64)(idx + 1); + continue; + } + + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] 
* vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + zbuffer[pixel] = std::min(zbuffer[pixel], token); + } + } + } +} + +void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map, int pix) +{ + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f) +{ + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + for (int i = 0; i < num_faces; ++i) { + 
rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0, + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + } else { + for (int i = 0; i < num_faces; ++i) + rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + for (int i = 0; i < width * height; ++i) + barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i); + + return {findices, barycentric}; +} + +std::vector rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + if (device_id == -1) + return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); + else + return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rasterize_image", &rasterize_image, "Custom image rasterization"); + m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization"); + m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization"); +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h new file mode 100644 index 0000000000000000000000000000000000000000..a1fa8ff2150cbf34644c5027a77f6400c8c9cdde --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h @@ -0,0 +1,54 @@ +#ifndef RASTERIZER_H_ +#define RASTERIZER_H_ + +#include +#include +#include +#include // For CUDA context +#include +#define INT64 uint64_t +#define MAXINT 2147483647 + +__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) { + return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1])); +} + +__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p, + float* barycentric) +{ + float beta_tri = calculateSignedArea2(a, p, c); + float gamma_tri = calculateSignedArea2(a, b, p); + float area = calculateSignedArea2(a, b, c); + if (area == 0) { + barycentric[0] = -1.0; + barycentric[1] = -1.0; + barycentric[2] = -1.0; + return; + } + float tri_inv = 1.0 / area; + float beta = beta_tri * tri_inv; + float gamma = gamma_tri * tri_inv; + float alpha = 1.0 - beta - gamma; + barycentric[0] = alpha; + barycentric[1] = beta; + barycentric[2] = gamma; +} + +__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) { + return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 && + barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 && + barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0; +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior); + +std::vector> build_hierarchy(std::vector view_layer_positions, std::vector view_layer_normals, int num_level, int resolution); + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + 
int num_level, int resolution); + +#endif \ No newline at end of file diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..cc6f354c0e2801b9ac84ec4547845c8edb606a60 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu @@ -0,0 +1,127 @@ +#include "rasterizer.h" + +__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5f, py + 0.5f}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + atomicExch(&zbuffer[pixel], (INT64)(idx + 1)); + continue; + } + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + atomicMin(&zbuffer[pixel], token); + } + } + } +} + +__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map) +{ + int pix = blockIdx.x * blockDim.x + threadIdx.x; + if (pix >= width * height) + return; + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +__global__ void 
rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces) +{ + int f = blockIdx.x * blockDim.x + threadIdx.x; + if (f >= num_faces) + return; + + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleGPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + cudaSetDevice(device_id); + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0, + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } else { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr()); + + return {findices, barycentric}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/setup.py b/hy3dgen/texgen/custom_rasterizer/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..3e312a7f45689753b5ba3ed4befff1fefecff6fd --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/setup.py @@ -0,0 +1,26 @@ +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +# build custom rasterizer +# build with `python setup.py install` +# nvcc is needed + +custom_rasterizer_module = CUDAExtension('custom_rasterizer_kernel', [ + 'lib/custom_rasterizer_kernel/rasterizer.cpp', + 'lib/custom_rasterizer_kernel/grid_neighbor.cpp', + 'lib/custom_rasterizer_kernel/rasterizer_gpu.cu', +]) + +setup( + packages=find_packages(), + version='0.1', + name='custom_rasterizer', + include_package_data=True, + package_dir={'': '.'}, + ext_modules=[ + 
custom_rasterizer_module, + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/hy3dgen/texgen/differentiable_renderer/__init__.py b/hy3dgen/texgen/differentiable_renderer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
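
The CUDA rasterizer added above resolves visibility without locks: for each covered pixel it quantizes the interpolated depth, packs the depth and the face id into a single 64-bit token, and lets `atomicMin` keep the token of the nearest face; `barycentricFromImgcoordGPU` later recovers the face id with a modulo. A minimal Python sketch of that packing scheme, using a placeholder `MAXINT` (the real constant is defined in `rasterizer.h`, which is not shown here):

```python
MAXINT = 2147483647  # placeholder value; the actual constant comes from rasterizer.h

def pack_token(depth: float, face_idx: int) -> int:
    """token = quantized_depth * MAXINT + (face_idx + 1), as in rasterizeTriangleGPU."""
    z_quantize = int(depth * (2 << 17))      # same depth quantization factor as the kernel
    return z_quantize * MAXINT + (face_idx + 1)

def unpack_face(token: int) -> int:
    """Recover face_idx + 1 the way barycentricFromImgcoordGPU does (token % MAXINT)."""
    return token % MAXINT

# Two faces compete for the same pixel; min() stands in for atomicMin on the z-buffer.
tokens = [pack_token(0.73, face_idx=4), pack_token(0.41, face_idx=9)]
print(unpack_face(min(tokens)) - 1)   # -> 9: the face with the smaller depth wins
```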
diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp new file mode 100644 index 0000000000000000000000000000000000000000..cb7a9671b7e96564de44070afdced28da0f631b7 Binary files /dev/null and b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.exp differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib new file mode 100644 index 0000000000000000000000000000000000000000..19b554dd00907fa3cacbf26d59f00247cd76985b Binary files /dev/null and b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.cp310-win_amd64.lib differ diff --git a/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj new file mode 100644 index 0000000000000000000000000000000000000000..318c2eddbb7c258091e2825e02abff7f65ef35b9 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/build/temp.win-amd64-cpython-310/Release/mesh_processor.obj @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa1f67f69a3f4389d88b5824de08503705112177eb5d8c7dd5ad09c2847e8b6 +size 7617045 diff --git a/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/hy3dgen/texgen/differentiable_renderer/camera_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..289710ab787a174b39154f1010fc6209e4c92dfe --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/camera_utils.py @@ -0,0 +1,116 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
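
`camera_utils.py` below builds a world-to-camera matrix from elevation, azimuth and distance using a z-up look-at convention, plus orthographic and perspective projection matrices. One property makes the construction easy to sanity-check: the scene `center` must land on the camera's negative z axis, exactly `camera_distance` away. A quick check, assuming the module is importable from the path this diff adds it under:

```python
import numpy as np

from hy3dgen.texgen.differentiable_renderer.camera_utils import get_mv_matrix

center = [0.1, -0.2, 0.05]
mv = get_mv_matrix(elev=20.0, azim=45.0, camera_distance=1.45, center=center)

cam_space = mv @ np.array(center + [1.0], dtype=np.float32)
print(np.round(cam_space, 4))   # expected: [ 0.  0. -1.45  1.] up to float error
```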
+ +import math + +import numpy as np +import torch + + +def transform_pos(mtx, pos, keepdim=False): + t_mtx = torch.from_numpy(mtx).to( + pos.device) if isinstance( + mtx, np.ndarray) else mtx + if pos.shape[-1] == 3: + posw = torch.cat( + [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1) + else: + posw = pos + + if keepdim: + return torch.matmul(posw, t_mtx.t())[...] + else: + return torch.matmul(posw, t_mtx.t())[None, ...] + + +def get_mv_matrix(elev, azim, camera_distance, center=None): + elev = -elev + azim += 90 + + elev_rad = math.radians(elev) + azim_rad = math.radians(azim) + + camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad), + camera_distance * + math.cos(elev_rad) * math.sin(azim_rad), + camera_distance * math.sin(elev_rad)]) + + if center is None: + center = np.array([0, 0, 0]) + else: + center = np.array(center) + + lookat = center - camera_position + lookat = lookat / np.linalg.norm(lookat) + + up = np.array([0, 0, 1.0]) + right = np.cross(lookat, up) + right = right / np.linalg.norm(right) + up = np.cross(right, lookat) + up = up / np.linalg.norm(up) + + c2w = np.concatenate( + [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1) + + w2c = np.zeros((4, 4)) + w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0)) + w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:]) + w2c[3, 3] = 1.0 + + return w2c.astype(np.float32) + + +def get_orthographic_projection_matrix( + left=-1, right=1, bottom=-1, top=1, near=0, far=2): + """ + 计算正交投影矩阵。 + + 参数: + left (float): 投影区域左侧边界。 + right (float): 投影区域右侧边界。 + bottom (float): 投影区域底部边界。 + top (float): 投影区域顶部边界。 + near (float): 投影区域近裁剪面距离。 + far (float): 投影区域远裁剪面距离。 + + 返回: + numpy.ndarray: 正交投影矩阵。 + """ + ortho_matrix = np.eye(4, dtype=np.float32) + ortho_matrix[0, 0] = 2 / (right - left) + ortho_matrix[1, 1] = 2 / (top - bottom) + ortho_matrix[2, 2] = -2 / (far - near) + ortho_matrix[0, 3] = -(right + left) / (right - left) + ortho_matrix[1, 3] = -(top + bottom) / (top - bottom) + ortho_matrix[2, 3] = -(far + near) / (far - near) + return ortho_matrix + + +def get_perspective_projection_matrix(fovy, aspect_wh, near, far): + fovy_rad = math.radians(fovy) + return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0], + [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0], + [0, 0, -(far + near) / (far - near), - + 2.0 * far * near / (far - near)], + [0, 0, -1, 0]]).astype(np.float32) diff --git a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat new file mode 100644 index 0000000000000000000000000000000000000000..3947b0f03f9f6245dac95db7460703076444a304 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat @@ -0,0 +1,3 @@ +FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i +echo %PYINCLUDES% +g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12 \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg b/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg new file mode 100644 index 0000000000000000000000000000000000000000..94aa03de74fc9b82fc5335e097d1c2f538610577 Binary files /dev/null and b/hy3dgen/texgen/differentiable_renderer/dist/mesh_processor-0.0.0-py3.10-win-amd64.egg differ diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp 
b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ca8650fada02099d3fce0f551fa4f953f278cf34 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +std::pair, + py::array_t> meshVerticeInpaint_smooth(py::array_t texture, +py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx) { + auto texture_buf = texture.request(); + auto mask_buf = mask.request(); + auto vtx_pos_buf = vtx_pos.request(); + auto vtx_uv_buf = vtx_uv.request(); + auto pos_idx_buf = pos_idx.request(); + auto uv_idx_buf = uv_idx.request(); + + int texture_height = texture_buf.shape[0]; + int texture_width = texture_buf.shape[1]; + int texture_channel = texture_buf.shape[2]; + float* texture_ptr = static_cast(texture_buf.ptr); + uint8_t* mask_ptr = static_cast(mask_buf.ptr); + + int vtx_num = vtx_pos_buf.shape[0]; + float* vtx_pos_ptr = static_cast(vtx_pos_buf.ptr); + float* vtx_uv_ptr = static_cast(vtx_uv_buf.ptr); + int* pos_idx_ptr = static_cast(pos_idx_buf.ptr); + int* uv_idx_ptr = static_cast(uv_idx_buf.ptr); + + vector vtx_mask(vtx_num, 0.0f); + vector> vtx_color(vtx_num, vector(texture_channel, 0.0f)); + vector uncolored_vtxs; + + vector> G(vtx_num); + + for (int i = 0; i < uv_idx_buf.shape[0]; ++i) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[i * 3 + k]; + int vtx_idx = pos_idx_ptr[i * 3 + k]; + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + if (mask_ptr[uv_u * texture_width + uv_v] > 0) { + vtx_mask[vtx_idx] = 1.0f; + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c]; + } + }else{ + uncolored_vtxs.push_back(vtx_idx); + } + + G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]); + } + } + + int smooth_count = 2; + int last_uncolored_vtx_count = 0; + while (smooth_count>0) { + int uncolored_vtx_count = 0; + + for (int vtx_idx : uncolored_vtxs) { + + vector sum_color(texture_channel, 0.0f); + float total_weight = 0.0f; + + array vtx_0 = {vtx_pos_ptr[vtx_idx * 3], +vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]}; + for (int connected_idx : G[vtx_idx]) { + if (vtx_mask[connected_idx] > 0) { + array vtx1 = {vtx_pos_ptr[connected_idx * 3], + vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]}; + float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \ + pow(vtx_0[2] - vtx1[2], 2)), 1E-4); + dist_weight = dist_weight * dist_weight; + for (int c = 0; c < texture_channel; ++c) { + sum_color[c] += vtx_color[connected_idx][c] * dist_weight; + } + total_weight += dist_weight; + } + } + + if (total_weight > 0.0f) { + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = sum_color[c] / total_weight; + } + vtx_mask[vtx_idx] = 1.0f; + } else { + uncolored_vtx_count++; + } + + } + + if(last_uncolored_vtx_count==uncolored_vtx_count){ + smooth_count--; + }else{ + smooth_count++; + } + last_uncolored_vtx_count = uncolored_vtx_count; + } + + // Create new arrays for the output + py::array_t new_texture(texture_buf.size); + py::array_t new_mask(mask_buf.size); + + auto new_texture_buf = new_texture.request(); + auto new_mask_buf = 
new_mask.request(); + + float* new_texture_ptr = static_cast(new_texture_buf.ptr); + uint8_t* new_mask_ptr = static_cast(new_mask_buf.ptr); + // Copy original texture and mask to new arrays + std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr); + std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr); + + for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k]; + int vtx_idx = pos_idx_ptr[face_idx * 3 + k]; + + if (vtx_mask[vtx_idx] == 1.0f) { + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + for (int c = 0; c < texture_channel; ++c) { + new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c]; + } + new_mask_ptr[uv_u * texture_width + uv_v] = 255; + } + } + } + + // Reshape the new arrays to match the original texture and mask shapes + new_texture.resize({texture_height, texture_width, 3}); + new_mask.resize({texture_height, texture_width}); + return std::make_pair(new_texture, new_mask); +} + + +std::pair, py::array_t> meshVerticeInpaint(py::array_t texture, + py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx, const std::string& method = "smooth") { + if (method == "smooth") { + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx); + } else { + throw std::invalid_argument("Invalid method. Use 'smooth' or 'forward'."); + } +} + +PYBIND11_MODULE(mesh_processor, m) { + m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh", + py::arg("texture"), py::arg("mask"), + py::arg("vtx_pos"), py::arg("vtx_uv"), + py::arg("pos_idx"), py::arg("uv_idx"), + py::arg("method") = "smooth"); +} \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..ddb5e19214f697ef854a3c010d9e1e1e25a49702 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/PKG-INFO @@ -0,0 +1,7 @@ +Metadata-Version: 2.2 +Name: mesh_processor +Version: 0.0.0 +Requires-Python: >=3.6 +Requires-Dist: pybind11>=2.6.0 +Dynamic: requires-dist +Dynamic: requires-python diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ca24855f9323bfe0f20a2fab4dc2f55e6e34079 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/SOURCES.txt @@ -0,0 +1,7 @@ +mesh_processor.cpp +setup.py +mesh_processor.egg-info/PKG-INFO +mesh_processor.egg-info/SOURCES.txt +mesh_processor.egg-info/dependency_links.txt +mesh_processor.egg-info/requires.txt +mesh_processor.egg-info/top_level.txt \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git 
a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..d89789fcaa28db9e76d59597b04095a0a9f99fa3 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/requires.txt @@ -0,0 +1 @@ +pybind11>=2.6.0 diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccd72df0d4e79e7f3ee7e8ad3728d300bde6c3fe --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.egg-info/top_level.txt @@ -0,0 +1 @@ +mesh_processor diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..a96955c19757df5ad18095b33829962140c04647 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py @@ -0,0 +1,70 @@ +import numpy as np + +def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): + texture_height, texture_width, texture_channel = texture.shape + vtx_num = vtx_pos.shape[0] + + vtx_mask = np.zeros(vtx_num, dtype=np.float32) + vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] + uncolored_vtxs = [] + G = [[] for _ in range(vtx_num)] + + for i in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[i, k] + vtx_idx = pos_idx[i, k] + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + if mask[uv_u, uv_v] > 0: + vtx_mask[vtx_idx] = 1.0 + vtx_color[vtx_idx] = texture[uv_u, uv_v] + else: + uncolored_vtxs.append(vtx_idx) + G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) + + smooth_count = 2 + last_uncolored_vtx_count = 0 + while smooth_count > 0: + uncolored_vtx_count = 0 + for vtx_idx in uncolored_vtxs: + sum_color = np.zeros(texture_channel, dtype=np.float32) + total_weight = 0.0 + vtx_0 = vtx_pos[vtx_idx] + for connected_idx in G[vtx_idx]: + if vtx_mask[connected_idx] > 0: + vtx1 = vtx_pos[connected_idx] + dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) + dist_weight = 1.0 / max(dist, 1e-4) + dist_weight *= dist_weight + sum_color += vtx_color[connected_idx] * dist_weight + total_weight += dist_weight + if total_weight > 0: + vtx_color[vtx_idx] = sum_color / total_weight + vtx_mask[vtx_idx] = 1.0 + else: + uncolored_vtx_count += 1 + + if last_uncolored_vtx_count == uncolored_vtx_count: + smooth_count -= 1 + else: + smooth_count += 1 + last_uncolored_vtx_count = uncolored_vtx_count + + new_texture = texture.copy() + new_mask = mask.copy() + for face_idx in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[face_idx, k] + vtx_idx = pos_idx[face_idx, k] + if vtx_mask[vtx_idx] == 1.0: + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + new_texture[uv_u, uv_v] = vtx_color[vtx_idx] + new_mask[uv_u, uv_v] = 255 + return new_texture, new_mask + +def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): + if method == "smooth": + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + else: + raise ValueError("Invalid method. 
Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/hy3dgen/texgen/differentiable_renderer/mesh_render.py new file mode 100644 index 0000000000000000000000000000000000000000..c85b80e043221282e9ff6bfb81764fb32c5d48ed --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_render.py @@ -0,0 +1,833 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import trimesh +from PIL import Image + +from .camera_utils import ( + transform_pos, + get_mv_matrix, + get_orthographic_projection_matrix, + get_perspective_projection_matrix, +) +from .mesh_processor import meshVerticeInpaint +from .mesh_utils import load_mesh, save_mesh + + +def stride_from_shape(shape): + stride = [1] + for x in reversed(shape[1:]): + stride.append(stride[-1] * x) + return list(reversed(stride)) + + +def scatter_add_nd_with_count(input, count, indices, values, weights=None): + # input: [..., C], D dimension + C channel + # count: [..., 1], D dimension + # indices: [N, D], long + # values: [N, C] + + D = indices.shape[-1] + C = input.shape[-1] + size = input.shape[:-1] + stride = stride_from_shape(size) + + assert len(size) == D + + input = input.view(-1, C) # [HW, C] + count = count.view(-1, 1) + + flatten_indices = (indices * torch.tensor(stride, + dtype=torch.long, device=indices.device)).sum(-1) # [N] + + if weights is None: + weights = torch.ones_like(values[..., :1]) + + input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) + count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) + + return input.view(*size, C), count.view(*size, 1) + + +def linear_grid_put_2d(H, W, coords, values, return_count=False): + # coords: [N, 2], float in [0, 1] + # values: [N, C] + + C = values.shape[-1] + + indices = coords * torch.tensor( + [H - 1, W - 1], dtype=torch.float32, device=coords.device + ) + indices_00 = indices.floor().long() # [N, 2] + indices_00[:, 0].clamp_(0, H - 2) + indices_00[:, 1].clamp_(0, W - 2) + indices_01 = indices_00 + torch.tensor( + [0, 1], dtype=torch.long, device=indices.device + ) + indices_10 = indices_00 + torch.tensor( + [1, 0], dtype=torch.long, device=indices.device + ) + indices_11 = indices_00 + torch.tensor( + [1, 1], dtype=torch.long, device=indices.device + ) + + h = indices[..., 0] - indices_00[..., 0].float() + w = indices[..., 1] - indices_00[..., 1].float() + w_00 = (1 - h) * (1 - w) + w_01 = (1 - h) * w + w_10 = h * (1 - w) + w_11 = h * w + + result = torch.zeros(H, W, C, device=values.device, + dtype=values.dtype) # [H, W, C] + count = torch.zeros(H, W, 1, device=values.device, + dtype=values.dtype) # [H, W, 1] + weights = torch.ones_like(values[..., :1]) # [N, 1] + + result, count = scatter_add_nd_with_count( + result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) + + if return_count: + return result, count + + mask = (count.squeeze(-1) > 0) + result[mask] = result[mask] / count[mask].repeat(1, C) + + return result + + +class MeshRender(): + def __init__( + self, + camera_distance=1.45, camera_type='orth', + default_resolution=1024, texture_size=1024, + use_antialias=True, max_mip_level=None, filter_mode='linear', + bake_mode='linear', raster_mode='cr', device='cuda'): + + self.device = device + + self.set_default_render_resolution(default_resolution) + self.set_default_texture_resolution(texture_size) + + self.camera_distance = camera_distance + self.use_antialias = use_antialias + self.max_mip_level = max_mip_level 
+ self.filter_mode = filter_mode + + self.bake_angle_thres = 75 + self.bake_unreliable_kernel_size = int( + (2 / 512) * max(self.default_resolution[0], self.default_resolution[1])) + self.bake_mode = bake_mode + + self.raster_mode = raster_mode + if self.raster_mode == 'cr': + import custom_rasterizer as cr + self.raster = cr + else: + raise f'No raster named {self.raster_mode}' + + if camera_type == 'orth': + self.ortho_scale = 1.2 + self.camera_proj_mat = get_orthographic_projection_matrix( + left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5, + bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5, + near=0.1, far=100 + ) + elif camera_type == 'perspective': + self.camera_proj_mat = get_perspective_projection_matrix( + 49.13, self.default_resolution[1] / self.default_resolution[0], + 0.01, 100.0 + ) + else: + raise f'No camera type {camera_type}' + + def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True): + + if self.raster_mode == 'cr': + rast_out_db = None + if pos.dim() == 2: + pos = pos.unsqueeze(0) + findices, barycentric = self.raster.rasterize(pos, tri, resolution) + rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1) + rast_out = rast_out.unsqueeze(0) + else: + raise f'No raster named {self.raster_mode}' + + return rast_out, rast_out_db + + def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None): + + if self.raster_mode == 'cr': + textd = None + barycentric = rast_out[0, ..., :-1] + findices = rast_out[0, ..., -1] + if uv.dim() == 2: + uv = uv.unsqueeze(0) + textc = self.raster.interpolate(uv, findices, barycentric, uv_idx) + else: + raise f'No raster named {self.raster_mode}' + + return textc, textd + + def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', + boundary_mode='wrap', max_mip_level=None): + + if self.raster_mode == 'cr': + raise f'Texture is not implemented in cr' + else: + raise f'No raster named {self.raster_mode}' + + return color + + def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + + if self.raster_mode == 'cr': + # Antialias has not been supported yet + color = color + else: + raise f'No raster named {self.raster_mode}' + + return color + + def load_mesh( + self, + mesh, + scale_factor=1.15, + auto_center=True, + ): + vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh) + self.mesh_copy = mesh + self.set_mesh(vtx_pos, pos_idx, + vtx_uv=vtx_uv, uv_idx=uv_idx, + scale_factor=scale_factor, auto_center=auto_center + ) + if texture_data is not None: + self.set_texture(texture_data) + + def save_mesh(self): + texture_data = self.get_texture() + texture_data = Image.fromarray((texture_data * 255).astype(np.uint8)) + return save_mesh(self.mesh_copy, texture_data) + + def set_mesh( + self, + vtx_pos, pos_idx, + vtx_uv=None, uv_idx=None, + scale_factor=1.15, auto_center=True + ): + + self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float() + self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int) + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float() + self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int) + else: + self.vtx_uv = None + self.uv_idx = None + + self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]] + self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]] + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1] + + if auto_center: + max_bb = 
(self.vtx_pos - 0).max(0)[0] + min_bb = (self.vtx_pos - 0).min(0)[0] + center = (max_bb + min_bb) / 2 + scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0 + self.vtx_pos = (self.vtx_pos - center) * \ + (scale_factor / float(scale)) + self.scale_factor = scale_factor + + def set_texture(self, tex): + if isinstance(tex, np.ndarray): + tex = Image.fromarray((tex * 255).astype(np.uint8)) + elif isinstance(tex, torch.Tensor): + tex = tex.cpu().numpy() + tex = Image.fromarray((tex * 255).astype(np.uint8)) + + tex = tex.resize(self.texture_size).convert('RGB') + tex = np.array(tex) / 255.0 + self.tex = torch.from_numpy(tex).to(self.device) + self.tex = self.tex.float() + + def set_default_render_resolution(self, default_resolution): + if isinstance(default_resolution, int): + default_resolution = (default_resolution, default_resolution) + self.default_resolution = default_resolution + + def set_default_texture_resolution(self, texture_size): + if isinstance(texture_size, int): + texture_size = (texture_size, texture_size) + self.texture_size = texture_size + + def get_mesh(self): + vtx_pos = self.vtx_pos.cpu().numpy() + pos_idx = self.pos_idx.cpu().numpy() + vtx_uv = self.vtx_uv.cpu().numpy() + uv_idx = self.uv_idx.cpu().numpy() + + # 坐标变换的逆变换 + vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]] + vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]] + + vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1] + return vtx_pos, pos_idx, vtx_uv, uv_idx + + def get_texture(self): + return self.tex.cpu().numpy() + + def to(self, device): + self.device = device + + for attr_name in dir(self): + attr_value = getattr(self, attr_name) + if isinstance(attr_value, torch.Tensor): + setattr(self, attr_name, attr_value.to(self.device)) + + def color_rgb_to_srgb(self, image): + if isinstance(image, Image.Image): + image_rgb = torch.tesnor( + np.array(image) / + 255.0).float().to( + self.device) + elif isinstance(image, np.ndarray): + image_rgb = torch.tensor(image).float() + else: + image_rgb = image.to(self.device) + + image_srgb = torch.where( + image_rgb <= 0.0031308, + 12.92 * image_rgb, + 1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055 + ) + + if isinstance(image, Image.Image): + image_srgb = Image.fromarray( + (image_srgb.cpu().numpy() * + 255).astype( + np.uint8)) + elif isinstance(image, np.ndarray): + image_srgb = image_srgb.cpu().numpy() + else: + image_srgb = image_srgb.to(image.device) + + return image_srgb + + def _render( + self, + glctx, + mvp, + pos, + pos_idx, + uv, + uv_idx, + tex, + resolution, + max_mip_level, + keep_alpha, + filter_mode + ): + pos_clip = transform_pos(mvp, pos) + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + glctx, pos_clip, pos_idx, resolution=resolution) + + tex = tex.contiguous() + if filter_mode == 'linear-mipmap-linear': + texc, texd = self.raster_interpolate( + uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = self.raster_texture( + tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx) + color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + color = color * visible_mask # Mask out background. + if self.use_antialias: + color = self.raster_antialias(color, rast_out, pos_clip, pos_idx) + + if keep_alpha: + color = torch.cat([color, visible_mask], dim=-1) + return color[0, ...] 
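
`linear_grid_put_2d` above is the scatter counterpart of bilinear sampling: each normalised (row, col) sample splats its value onto the four surrounding texels with bilinear weights, and the accumulated weight map is used for normalisation (or returned as-is when `return_count=True`). A quick CPU check using the function defined above:

```python
import torch

# If the package is installed, the helper can also be imported:
# from hy3dgen.texgen.differentiable_renderer.mesh_render import linear_grid_put_2d

coords = torch.tensor([[0.3, 0.6]])        # one sample at grid index (1.2, 2.4) on a 5x5 grid
values = torch.tensor([[1.0, 0.0, 0.0]])   # a single red sample
grid, count = linear_grid_put_2d(5, 5, coords, values, return_count=True)

print(count.squeeze(-1)[1:3, 2:4])   # bilinear weights ~[[0.48, 0.32], [0.12, 0.08]], summing to 1
print(grid.sum(dim=(0, 1)))          # total splatted colour ~ tensor([1., 0., 0.])
```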
+ + def render( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + tex=None, + keep_alpha=True, + bgcolor=None, + filter_mode=None, + return_type='th' + ): + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + if tex is not None: + if isinstance(tex, Image.Image): + tex = torch.tensor(np.array(tex) / 255.0) + elif isinstance(tex, np.ndarray): + tex = torch.tensor(tex) + if tex.dim() == 2: + tex = tex.unsqueeze(-1) + tex = tex.float().to(self.device) + image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, + self.tex if tex is None else tex, + self.default_resolution if resolution is None else resolution, + self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) + mask = (image[..., [-1]] == 1).float() + if bgcolor is None: + bgcolor = [0 for _ in range(image.shape[-1] - 1)] + image = image * mask + (1 - mask) * \ + torch.tensor(bgcolor + [0]).to(self.device) + if keep_alpha == False: + image = image[..., :-1] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_normal( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + bg_color=[1, 1, 1], + use_abs_coor=False, + normalize_rgb=True, + return_type='th' + ): + + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + if use_abs_coor: + mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] + else: + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + mesh_triangles = pos_camera[self.pos_idx[:, :3], :] + face_normals = F.normalize( + torch.cross(mesh_triangles[:, + 1, + :] - mesh_triangles[:, + 0, + :], + mesh_triangles[:, + 2, + :] - mesh_triangles[:, + 0, + :], + dim=-1), + dim=-1) + + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + + # Interpolate normal values across the rasterized pixels + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + normal = normal * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) # Mask out background. + + if normalize_rgb: + normal = (normal + 1) * 0.5 + if self.use_antialias: + normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) + + image = normal[0, ...] 
+ if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + + return image + + def convert_normal_map(self, image): + # blue is front, red is left, green is top + if isinstance(image, Image.Image): + image = np.array(image) + mask = (image == [255, 255, 255]).all(axis=-1) + + image = (image / 255.0) * 2.0 - 1.0 + + image[..., [1]] = -image[..., [1]] + image[..., [1, 2]] = image[..., [2, 1]] + image[..., [0]] = -image[..., [0]] + + image = (image + 1.0) * 0.5 + + image = (image * 255).astype(np.uint8) + image[mask] = [127, 127, 255] + + return Image.fromarray(image) + + def get_pos_from_mvp(self, elev, azim, camera_distance, center): + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + + return pos_camera, pos_clip + + def render_depth( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + return_type='th' + ): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + + # Interpolate depth values across the rasterized pixels + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth = (depth - depth_min) / (depth_max - depth_min) + + depth = depth * visible_mask # Mask out background. + if self.use_antialias: + depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) + + image = depth[0, ...] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_position(self, elev, azim, camera_distance=None, center=None, + resolution=None, bg_color=[1, 1, 1], return_type='th'): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor + tex_position = tex_position.contiguous() + + # Interpolate depth values across the rasterized pixels + position, _ = self.raster_interpolate( + tex_position[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + + position = position * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) # Mask out background. + if self.use_antialias: + position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) + + image = position[0, ...] 
+ + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_uvpos(self, return_type='th'): + image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def uv_feature_map(self, vert_feat, bg=None): + vtx_uv = self.vtx_uv * 2 - 1.0 + vtx_uv = torch.cat( + [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) + vtx_uv[..., -1] = 1 + uv_idx = self.uv_idx + rast_out, rast_out_db = self.raster_rasterize( + vtx_uv, uv_idx, resolution=self.texture_size) + feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) + feat_map = feat_map[0, ...] + if bg is not None: + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + feat_map[visible_mask == 0] = bg + return feat_map + + def render_sketch_from_geometry(self, normal_image, depth_image): + normal_image_np = normal_image.cpu().numpy() + depth_image_np = depth_image.cpu().numpy() + + normal_image_np = (normal_image_np * 255).astype(np.uint8) + depth_image_np = (depth_image_np * 255).astype(np.uint8) + normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) + + normal_edges = cv2.Canny(normal_image_np, 80, 150) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + + combined_edges = np.maximum(normal_edges, depth_edges) + + sketch_image = torch.from_numpy(combined_edges).to( + normal_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + + return sketch_image + + def render_sketch_from_depth(self, depth_image): + depth_image_np = depth_image.cpu().numpy() + depth_image_np = (depth_image_np * 255).astype(np.uint8) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + combined_edges = depth_edges + sketch_image = torch.from_numpy(combined_edges).to( + depth_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + return sketch_image + + def back_project(self, image, elev, azim, + camera_distance=None, center=None, method=None): + if isinstance(image, Image.Image): + image = torch.tensor(np.array(image) / 255.0) + elif isinstance(image, np.ndarray): + image = torch.tensor(image) + if image.dim() == 2: + image = image.unsqueeze(-1) + image = image.float().to(self.device) + resolution = image.shape[:2] + channel = image.shape[-1] + texture = torch.zeros(self.texture_size + (channel,)).to(self.device) + cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + v0 = pos_camera[self.pos_idx[:, 0], :] + v1 = pos_camera[self.pos_idx[:, 1], :] + v2 = pos_camera[self.pos_idx[:, 2], :] + face_normals = F.normalize( + torch.cross( + v1 - v0, + v2 - v0, + dim=-1), + dim=-1) + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + rast_out, rast_out_db 
= self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + normal = normal[0, ...] + uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + depth = depth[0, ...] + + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth_normalized = (depth - depth_min) / (depth_max - depth_min) + depth_image = depth_normalized * visible_mask # Mask out background. + + sketch_image = self.render_sketch_from_depth(depth_image) + + lookat = torch.tensor([[0, 0, -1]], device=self.device) + cos_image = torch.nn.functional.cosine_similarity( + lookat, normal.view(-1, 3)) + cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) + + cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) + cos_image[cos_image < cos_thres] = 0 + + # shrink + kernel_size = self.bake_unreliable_kernel_size * 2 + 1 + kernel = torch.ones( + (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( + sketch_image.device) + + visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() + visible_mask = F.conv2d( + 1.0 - visible_mask, + kernel, + padding=kernel_size // 2) + visible_mask = 1.0 - (visible_mask > 0).float() # 二值化 + visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) + + sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) + sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) + sketch_image = (sketch_image > 0).float() # 二值化 + sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) + visible_mask = visible_mask * (sketch_image < 0.5) + + cos_image[visible_mask == 0] = 0 + + method = self.bake_mode if method is None else method + + if method == 'linear': + proj_mask = (visible_mask != 0).view(-1) + uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] + image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] + cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] + sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] + + texture = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) + cos_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) + boundary_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) + else: + raise f'No bake mode {method}' + + return texture, cos_map, boundary_map + + def bake_texture(self, colors, elevs, azims, + camera_distance=None, center=None, exp=6, weights=None): + for i in range(len(colors)): + if isinstance(colors[i], Image.Image): + colors[i] = torch.tensor( + np.array( + colors[i]) / 255.0, + device=self.device).float() + if weights is None: + weights = [1.0 for _ in range(colors)] + textures = [] + cos_maps = [] + for color, elev, azim, weight in zip(colors, elevs, azims, weights): + texture, cos_map, _ = self.back_project( + color, elev, azim, camera_distance, center) + cos_map = weight * (cos_map ** exp) + textures.append(texture) + cos_maps.append(cos_map) + + texture_merge, trust_map_merge = self.fast_bake_texture( + textures, cos_maps) + return texture_merge, trust_map_merge + + @torch.no_grad() + def fast_bake_texture(self, textures, cos_maps): + + channel = textures[0].shape[-1] + texture_merge = torch.zeros( + self.texture_size + (channel,)).to(self.device) + trust_map_merge = 
torch.zeros(self.texture_size + (1,)).to(self.device) + for texture, cos_map in zip(textures, cos_maps): + view_sum = (cos_map > 0).sum() + painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() + if painted_sum / view_sum > 0.99: + continue + texture_merge += texture * cos_map + trust_map_merge += cos_map + texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) + + return texture_merge, trust_map_merge > 1E-8 + + def uv_inpaint(self, texture, mask): + + if isinstance(texture, torch.Tensor): + texture_np = texture.cpu().numpy() + elif isinstance(texture, np.ndarray): + texture_np = texture + elif isinstance(texture, Image.Image): + texture_np = np.array(texture) / 255.0 + + vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() + + texture_np, mask = meshVerticeInpaint( + texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + + texture_np = cv2.inpaint( + (texture_np * + 255).astype( + np.uint8), + 255 - + mask, + 3, + cv2.INPAINT_NS) + + return texture_np diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0ba1a6145c68651ec033b97e80900cd2c9d7ec --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py @@ -0,0 +1,44 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
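
`bake_texture` and `fast_bake_texture` above merge the per-view back-projections by confidence: each view's cosine map is raised to the power `exp`, a view that would almost only repaint already-trusted texels is skipped, and the accumulated colours are normalised by the accumulated confidence. Per texel this reduces to a weighted average, e.g.:

```python
import torch

# Two single-texel "views": (colour, confidence) pairs as produced by back_project.
views = [
    (torch.tensor([1.0, 0.0, 0.0]), torch.tensor([0.8])),   # near-frontal view, high confidence
    (torch.tensor([0.0, 0.0, 1.0]), torch.tensor([0.2])),   # grazing view, low confidence
]

texture_merge = torch.zeros(3)
trust_merge = torch.zeros(1)
for colour, cos_map in views:
    texture_merge += colour * cos_map
    trust_merge += cos_map

texture_merge = texture_merge / torch.clamp(trust_merge, min=1e-8)
print(texture_merge)   # tensor([0.8000, 0.0000, 0.2000]): dominated by the high-confidence view
```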
+ +import trimesh + + +def load_mesh(mesh): + vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None + pos_idx = mesh.faces if hasattr(mesh, 'faces') else None + + vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None + uv_idx = mesh.faces if hasattr(mesh, 'faces') else None + + texture_data = None + + return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data + + +def save_mesh(mesh, texture_data): + material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) + texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) + mesh.visual = texture_visuals + return mesh diff --git a/hy3dgen/texgen/differentiable_renderer/setup.py b/hy3dgen/texgen/differentiable_renderer/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea78693fe96ac027742bd752238421c6d83f8fc --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/setup.py @@ -0,0 +1,48 @@ +from setuptools import setup, Extension +import pybind11 +import sys +import platform + +def get_platform_specific_args(): + system = platform.system().lower() + cpp_std = 'c++14' # Make configurable if needed + + if sys.platform == 'win32': + compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] + link_args = [] + extra_includes = [] + elif system == 'linux': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] + link_args = ['-fPIC', '-pthread'] + extra_includes = [] + elif sys.platform == 'darwin': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', + '-stdlib=libc++', '-mmacosx-version-min=10.14'] + link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] + extra_includes = [] + else: + raise RuntimeError(f"Unsupported platform: {system}") + + return compile_args, link_args, extra_includes + +extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() +include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] +include_dirs.extend(platform_includes) + +ext_modules = [ + Extension( + "mesh_processor", + ["mesh_processor.cpp"], + include_dirs=include_dirs, + language='c++', + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ), +] + +setup( + name="mesh_processor", + ext_modules=ext_modules, + install_requires=['pybind11>=2.6.0'], + python_requires='>=3.6', +) \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/__init__.py b/hy3dgen/texgen/hunyuanpaint/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. 
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..436ce34efb8bc40c3df2b3902b7a29dffa39ae91 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/pipeline.py @@ -0,0 +1,554 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
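
`mesh_utils.py` above keeps the mesh I/O deliberately thin: `load_mesh` pulls vertices, faces and UVs out of a trimesh object, and `save_mesh` attaches a baked texture image via a `SimpleMaterial`/`TextureVisuals` pair so the texture travels with the exported file. A usage sketch with placeholder file names:

```python
import trimesh
from PIL import Image

from hy3dgen.texgen.differentiable_renderer.mesh_utils import load_mesh, save_mesh

mesh = trimesh.load('mesh_with_uvs.obj')   # placeholder path; the mesh must already carry UV coordinates
vtx_pos, pos_idx, vtx_uv, uv_idx, _ = load_mesh(mesh)

texture = Image.open('baked_texture.png').convert('RGB')   # e.g. a texture produced by MeshRender
textured = save_mesh(mesh, texture)
textured.export('textured_mesh.glb')
```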
+ +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy +import numpy as np +import torch +import torch.distributed +import torch.utils.checkpoint +from PIL import Image +from diffusers import ( + AutoencoderKL, + DiffusionPipeline, + ImagePipelineOutput +) +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \ + rescale_noise_cfg +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import deprecate +from einops import rearrange +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from .unet.modules import UNet2p5DConditionModel + + +def to_rgb_image(maybe_rgba: Image.Image): + if maybe_rgba.mode == 'RGB': + return maybe_rgba + elif maybe_rgba.mode == 'RGBA': + rgba = maybe_rgba + img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) + img = Image.fromarray(img, 'RGB') + img.paste(rgba, mask=rgba.getchannel('A')) + return img + else: + raise ValueError("Unsupported image type.", maybe_rgba.mode) + + +class HunyuanPaintPipeline(StableDiffusionPipeline): + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2p5DConditionModel, + scheduler: KarrasDiffusionSchedulers, + feature_extractor: CLIPImageProcessor, + safety_checker=None, + use_torch_compile=False, + ): + DiffusionPipeline.__init__(self) + + safety_checker = None + self.register_modules( + vae=torch.compile(vae) if use_torch_compile else vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + @torch.no_grad() + def encode_images(self, images): + B = images.shape[0] + images = rearrange(images, 'b n c h w -> (b n) c h w') + + dtype = next(self.vae.parameters()).dtype + images = (images - 0.5) * 2.0 + posterior = self.vae.encode(images.to(dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) + return latents + + @torch.no_grad() + def __call__( + self, + image: Image.Image = None, + prompt=None, + negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', + *args, + num_images_per_prompt: Optional[int] = 1, + guidance_scale=2.0, + output_type: Optional[str] = "pil", + width=512, + height=512, + num_inference_steps=28, + return_dict=True, + **cached_condition, + ): + if image is None: + raise ValueError("Inputting embeddings not supported for this pipeline. 
Please pass an image.") + assert not isinstance(image, torch.Tensor) + + image = to_rgb_image(image) + + image_vae = torch.tensor(np.array(image) / 255.0) + image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) + image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype) + + batch_size = image_vae.shape[0] + assert batch_size == 1 + assert num_images_per_prompt == 1 + + ref_latents = self.encode_images(image_vae) + + def convert_pil_list_to_tensor(images): + bg_c = [1., 1., 1.] + images_tensor = [] + for batch_imgs in images: + view_imgs = [] + for pil_img in batch_imgs: + img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. + if img.shape[2] > 3: + alpha = img[:, :, 3:] + img = img[:, :, :3] * alpha + bg_c * (1 - alpha) + img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") + view_imgs.append(img) + view_imgs = torch.cat(view_imgs, dim=0) + images_tensor.append(view_imgs.unsqueeze(0)) + + images_tensor = torch.cat(images_tensor, dim=0) + return images_tensor + + if "normal_imgs" in cached_condition: + + if isinstance(cached_condition["normal_imgs"], List): + cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) + + cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) + + if "position_imgs" in cached_condition: + + if isinstance(cached_condition["position_imgs"], List): + cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) + + cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) + + if 'camera_info_gen' in cached_condition: + camera_info = cached_condition['camera_info_gen'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(image_vae.device).to(torch.int64) + cached_condition['camera_info_gen'] = camera_info + if 'camera_info_ref' in cached_condition: + camera_info = cached_condition['camera_info_ref'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(image_vae.device).to(torch.int64) + cached_condition['camera_info_ref'] = camera_info + + cached_condition['ref_latents'] = ref_latents + + if guidance_scale > 1: + negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) + cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) + cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) + if "normal_imgs" in cached_condition: + cached_condition['normal_imgs'] = torch.cat( + (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) + + if "position_imgs" in cached_condition: + cached_condition['position_imgs'] = torch.cat( + (cached_condition['position_imgs'], cached_condition['position_imgs'])) + + if 'position_maps' in cached_condition: + cached_condition['position_maps'] = torch.cat( + (cached_condition['position_maps'], cached_condition['position_maps'])) + + if 'camera_info_gen' in cached_condition: + cached_condition['camera_info_gen'] = torch.cat( + (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) + if 'camera_info_ref' in cached_condition: + cached_condition['camera_info_ref'] = torch.cat( + (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) + + prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) + negative_prompt_embeds = 
torch.zeros_like(prompt_embeds) + + latents: torch.Tensor = self.denoise( + None, + *args, + cross_attention_kwargs=None, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_inference_steps=num_inference_steps, + output_type='latent', + width=width, + height=height, + **cached_condition + ).images + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def denoise( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. 
If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. 
`callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + assert num_images_per_prompt == 1 + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * kwargs['num_in_batch'], # num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) + + # predict the noise residual + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, **kwargs + )[0] + latents = rearrange(latents, 'b n c h w -> (b n) c h w') + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = \ + self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, + return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. diff --git a/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/hy3dgen/texgen/hunyuanpaint/unet/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..5d16bc6b6bb1ebc72c602dcb298d122429fe847d --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/modules.py @@ -0,0 +1,440 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +import copy +import json +import os +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from diffusers.models import UNet2DConditionModel +from diffusers.models.attention_processor import Attention +from diffusers.models.transformers.transformer_2d import BasicTransformerBlock +from einops import rearrange + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." 
+ ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +class Basic2p5DTransformerBlock(torch.nn.Module): + def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None: + super().__init__() + self.transformer = transformer + self.layer_name = layer_name + self.use_ma = use_ma + self.use_ra = use_ra + + # multiview attn + if self.use_ma: + self.attn_multiview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + # ref attn + if self.use_ra: + self.attn_refview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.transformer, name) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) + mode = cross_attention_kwargs.pop('mode', None) + mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) + ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) + condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) + + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. 
Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 Reference Attention + if 'w' in mode: + condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', + n=num_in_batch) # B, (N L), C + + if 'r' in mode and self.use_ra: + condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, + 1) # B N L C + condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') + + attn_output = self.attn_refview( + norm_hidden_states, + encoder_hidden_states=condition_embed, + attention_mask=None, + **cross_attention_kwargs + ) + ref_scale_timing = ref_scale + if isinstance(ref_scale, torch.Tensor): + ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) + for _ in range(attn_output.ndim - 1): + ref_scale_timing = ref_scale_timing.unsqueeze(-1) + hidden_states = ref_scale_timing * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.3 Multiview Attention + if num_in_batch > 1 and self.use_ma: + multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) + + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + **cross_attention_kwargs + ) + + attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) + + hidden_states = mva_scale * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # 4. 
Feed-forward + # i2vgen doesn't have this norm 🤷‍♂️ + if self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif not self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm3(hidden_states) + + if self.norm_type == "ada_norm_zero": + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.norm_type == "ada_norm_zero": + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.norm_type == "ada_norm_single": + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +class UNet2p5DConditionModel(torch.nn.Module): + def __init__(self, unet: UNet2DConditionModel) -> None: + super().__init__() + self.unet = unet + + self.use_ma = True + self.use_ra = True + self.use_camera_embedding = True + self.use_dual_stream = True + + if self.use_dual_stream: + self.unet_dual = copy.deepcopy(unet) + self.init_attention(self.unet_dual) + self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra) + self.init_condition() + self.init_camera_embedding() + + @staticmethod + def from_pretrained(pretrained_model_name_or_path, **kwargs): + torch_dtype = kwargs.pop('torch_dtype', torch.float32) + config_path = os.path.join(pretrained_model_name_or_path, 'config.json') + unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') + with open(config_path, 'r', encoding='utf-8') as file: + config = json.load(file) + unet = UNet2DConditionModel(**config) + unet = UNet2p5DConditionModel(unet) + unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) + unet.load_state_dict(unet_ckpt, strict=True) + unet = unet.to(torch_dtype) + return unet + + def init_condition(self): + self.unet.conv_in = torch.nn.Conv2d( + 12, + self.unet.conv_in.out_channels, + kernel_size=self.unet.conv_in.kernel_size, + stride=self.unet.conv_in.stride, + padding=self.unet.conv_in.padding, + dilation=self.unet.conv_in.dilation, + groups=self.unet.conv_in.groups, + bias=self.unet.conv_in.bias is not None) + + self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) + self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) + + def init_camera_embedding(self): + + if self.use_camera_embedding: + time_embed_dim = 1280 + self.max_num_ref_image = 5 + self.max_num_gen_image = 12 * 3 + 4 * 2 + self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) + + def init_attention(self, unet, use_ma=False, use_ra=False): + + for down_block_i, down_block in enumerate(unet.down_blocks): + if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: + for attn_i, attn in enumerate(down_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + 
f'down_{down_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra) + + if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: + for attn_i, attn in enumerate(unet.mid_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + f'mid_{attn_i}_{transformer_i}', + use_ma, use_ra) + + for up_block_i, up_block in enumerate(unet.up_blocks): + if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: + for attn_i, attn in enumerate(up_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer, + f'up_{up_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.unet, name) + + def forward( + self, sample, timestep, encoder_hidden_states, + *args, down_intrablock_additional_residuals=None, + down_block_res_samples=None, mid_block_res_sample=None, + **cached_condition, + ): + B, N_gen, _, H, W = sample.shape + assert H == W + + if self.use_camera_embedding: + camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image + camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') + else: + camera_info_gen = None + + sample = [sample] + if 'normal_imgs' in cached_condition: + sample.append(cached_condition["normal_imgs"]) + if 'position_imgs' in cached_condition: + sample.append(cached_condition["position_imgs"]) + sample = torch.cat(sample, dim=2) + + sample = rearrange(sample, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) + encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') + + if self.use_ra: + if 'condition_embed_dict' in cached_condition: + condition_embed_dict = cached_condition['condition_embed_dict'] + else: + condition_embed_dict = {} + ref_latents = cached_condition['ref_latents'] + N_ref = ref_latents.shape[1] + if self.use_camera_embedding: + camera_info_ref = cached_condition['camera_info_ref'] + camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') + else: + camera_info_ref = None + + ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) + encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') + + noisy_ref_latents = ref_latents + timestep_ref = 0 + + if self.use_dual_stream: + unet_ref = self.unet_dual + else: + unet_ref = self.unet + unet_ref( + noisy_ref_latents, timestep_ref, + encoder_hidden_states=encoder_hidden_states_ref, + class_labels=camera_info_ref, + # **kwargs + return_dict=False, + cross_attention_kwargs={ + 'mode': 'w', 'num_in_batch': N_ref, + 'condition_embed_dict': condition_embed_dict}, + ) + cached_condition['condition_embed_dict'] = condition_embed_dict + else: + condition_embed_dict = None + + mva_scale = cached_condition.get('mva_scale', 1.0) + ref_scale = cached_condition.get('ref_scale', 1.0) + + return self.unet( + sample, timestep, + encoder_hidden_states_gen, *args, + class_labels=camera_info_gen, + down_intrablock_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for 
sample in down_intrablock_additional_residuals + ] if down_intrablock_additional_residuals is not None else None, + down_block_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples + ] if down_block_res_samples is not None else None, + mid_block_additional_residual=( + mid_block_res_sample.to(dtype=self.unet.dtype) + if mid_block_res_sample is not None else None + ), + return_dict=False, + cross_attention_kwargs={ + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + }, + ) diff --git a/hy3dgen/texgen/pipelines.py b/hy3dgen/texgen/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..ebb95ea41ed0e39b3b48d1fe30ad22e4dd5f4a41 --- /dev/null +++ b/hy3dgen/texgen/pipelines.py @@ -0,0 +1,227 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
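Before the texturing pipeline whose imports follow, a short aside on the 2.5D UNet wrapper defined above in `hy3dgen/texgen/hunyuanpaint/unet/modules.py`: its `from_pretrained` reads a plain `config.json` plus `diffusion_pytorch_model.bin` from a local directory, builds the base `UNet2DConditionModel`, wraps it, and then loads the checkpoint into the wrapped model. A minimal loading sketch follows; the checkpoint path is a hypothetical placeholder, not a path taken from this diff.

```python
# Minimal sketch of loading the 2.5D UNet wrapper defined in unet/modules.py.
# The directory below is a hypothetical placeholder; from_pretrained expects it
# to contain config.json and diffusion_pytorch_model.bin.
import torch

from hy3dgen.texgen.hunyuanpaint.unet.modules import UNet2p5DConditionModel

unet = UNet2p5DConditionModel.from_pretrained(
    "/path/to/hunyuan3d-paint-v2-0/unet",  # hypothetical local checkpoint dir
    torch_dtype=torch.float16,
)

# The wrapper keeps a second copy of the base UNet as a reference stream
# (use_dual_stream), swaps every BasicTransformerBlock for a
# Basic2p5DTransformerBlock (multiview and reference attention on the main
# stream), widens conv_in to 12 input channels so normal and position latents
# can be concatenated with the noisy latents, and registers a camera-index
# class embedding that is passed as class_labels in forward().
print(sum(p.numel() for p in unet.parameters()))  # quick sanity check
```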
+ + +import logging +import os + +import numpy as np +import torch +from PIL import Image + +from .differentiable_renderer.mesh_render import MeshRender +from .utils.dehighlight_utils import Light_Shadow_Remover +from .utils.multiview_utils import Multiview_Diffusion_Net +from .utils.uv_warp_utils import mesh_uv_wrap + +logger = logging.getLogger(__name__) + + +class Hunyuan3DTexGenConfig: + + def __init__(self, light_remover_ckpt_path, multiview_ckpt_path): + self.device = 'cpu' + self.light_remover_ckpt_path = light_remover_ckpt_path + self.multiview_ckpt_path = multiview_ckpt_path + + self.candidate_camera_azims = [0, 90, 180, 270, 0, 180] + self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90] + self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05] + + self.render_size = 2048 + self.texture_size = 1024 + self.bake_exp = 4 + self.merge_method = 'fast' + + +class Hunyuan3DPaintPipeline: + @classmethod + def from_pretrained(cls, model_path): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path)) + + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0') + + if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path): + try: + import huggingface_hub + # download from huggingface + model_path = huggingface_hub.snapshot_download(repo_id=original_model_path) + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0') + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path)) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + else: + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path)) + + raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface") + + def __init__(self, config): + self.config = config + self.models = {} + self.render = MeshRender( + default_resolution=self.config.render_size, + texture_size=self.config.texture_size) + + self.load_models() + + def load_models(self): + # empty cude cache + torch.cuda.empty_cache() + # Load model + self.models['delight_model'] = Light_Shadow_Remover(self.config) + self.models['multiview_model'] = Multiview_Diffusion_Net(self.config) + + def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True): + normal_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + normal_map = self.render.render_normal( + elev, azim, use_abs_coor=use_abs_coor, return_type='pl') + normal_maps.append(normal_map) + + return normal_maps + + def render_position_multiview(self, camera_elevs, camera_azims): + position_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + position_map = self.render.render_position( + elev, azim, return_type='pl') + position_maps.append(position_map) + + return position_maps + + def bake_from_multiview(self, views, camera_elevs, + camera_azims, view_weights, method='graphcut'): + project_textures, project_weighted_cos_maps = [], [] + project_boundary_maps = [] + for view, camera_elev, camera_azim, weight in zip( + views, camera_elevs, camera_azims, view_weights): + project_texture, project_cos_map, project_boundary_map = self.render.back_project( + view, camera_elev, camera_azim) + project_cos_map = weight * (project_cos_map ** self.config.bake_exp) + project_textures.append(project_texture) + project_weighted_cos_maps.append(project_cos_map) + project_boundary_maps.append(project_boundary_map) + + if method == 'fast': + texture, ori_trust_map = self.render.fast_bake_texture( + project_textures, project_weighted_cos_maps) + else: + raise f'no method {method}' + return texture, ori_trust_map > 1E-8 + + def texture_inpaint(self, texture, mask): + + texture_np = self.render.uv_inpaint(texture, mask) + texture = torch.tensor(texture_np / 255).float().to(texture.device) + + return texture + + def recenter_image(self, image, border_ratio=0.2): + if image.mode == 'RGB': + return image + elif image.mode == 'L': + image = image.convert('RGB') + return image + + alpha_channel = np.array(image)[:, :, 3] + non_zero_indices = np.argwhere(alpha_channel > 0) + if non_zero_indices.size == 0: + raise ValueError("Image is fully transparent") + + min_row, min_col = non_zero_indices.min(axis=0) + max_row, max_col = non_zero_indices.max(axis=0) + + cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1)) + + width, height = cropped_image.size + border_width = int(width * border_ratio) + border_height = int(height * border_ratio) + + new_width = width + 2 * border_width + new_height = height + 2 * border_height + + square_size = max(new_width, new_height) + + new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0)) + + paste_x = (square_size - new_width) // 2 + border_width + paste_y = (square_size - new_height) // 2 + border_height + + new_image.paste(cropped_image, (paste_x, paste_y)) + return new_image + + @torch.no_grad() + def __call__(self, mesh, image): + + if isinstance(image, str): + image_prompt = Image.open(image) + else: + image_prompt = image + + 
image_prompt = self.recenter_image(image_prompt) + + image_prompt = self.models['delight_model'](image_prompt) + + mesh = mesh_uv_wrap(mesh) + + self.render.load_mesh(mesh) + + selected_camera_elevs, selected_camera_azims, selected_view_weights = \ + self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights + + normal_maps = self.render_normal_multiview( + selected_camera_elevs, selected_camera_azims, use_abs_coor=True) + position_maps = self.render_position_multiview( + selected_camera_elevs, selected_camera_azims) + + camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[ + elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in + zip(selected_camera_azims, selected_camera_elevs)] + multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info) + + for i in range(len(multiviews)): + multiviews[i] = multiviews[i].resize( + (self.config.render_size, self.config.render_size)) + + texture, mask = self.bake_from_multiview(multiviews, + selected_camera_elevs, selected_camera_azims, selected_view_weights, + method=self.config.merge_method) + + mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8) + + texture = self.texture_inpaint(texture, mask_np) + + self.render.set_texture(texture) + textured_mesh = self.render.save_mesh() + + return textured_mesh diff --git a/hy3dgen/texgen/utils/__init__.py b/hy3dgen/texgen/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e307c3f8c1292da02f308e4b59ef0bcd6fe7305e --- /dev/null +++ b/hy3dgen/texgen/utils/__init__.py @@ -0,0 +1,23 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
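The texturing entry point added in `hy3dgen/texgen/pipelines.py` above is `Hunyuan3DPaintPipeline`: `from_pretrained` resolves the delight and multiview checkpoints (from a local path, under `$HY3DGEN_MODELS`, or via `huggingface_hub.snapshot_download`), and `__call__(mesh, image)` delights the reference image, renders normal and position maps for the six candidate cameras, runs the multiview diffusion model, bakes and inpaints the UV texture, and returns the textured mesh. A minimal usage sketch follows; the model id, the `trimesh` input type, the file names, and the final `export` call are illustrative assumptions rather than facts from this diff.

```python
# Minimal usage sketch for the texture pipeline defined above.
# Assumptions (not taken from this diff): the model id passed to
# from_pretrained, that the input mesh is a trimesh.Trimesh, the file names,
# and that the returned textured mesh exposes a trimesh-style export().
import trimesh
from PIL import Image

from hy3dgen.texgen.pipelines import Hunyuan3DPaintPipeline

# from_pretrained() first looks for a local directory (or one under
# $HY3DGEN_MODELS, default ~/.cache/hy3dgen) containing the
# hunyuan3d-delight-v2-0 and hunyuan3d-paint-v2-0 subfolders, and otherwise
# snapshots the repo from the Hugging Face Hub.
pipeline = Hunyuan3DPaintPipeline.from_pretrained("tencent/Hunyuan3D-2")  # placeholder id

mesh = trimesh.load("untextured.glb", force="mesh")  # hypothetical input mesh
image = Image.open("reference.png")                  # hypothetical reference image

# __call__ delights the image, renders normal/position maps, runs the
# multiview diffusion net, bakes the views into a UV texture, inpaints the
# uncovered regions and returns the textured mesh from the renderer.
textured_mesh = pipeline(mesh, image)
textured_mesh.export("textured.glb")                 # assumed trimesh-like API
```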
diff --git a/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/hy3dgen/texgen/utils/alignImg4Tex_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0a09c17cfe1a3f1ac850688e96b66341f0226418 --- /dev/null +++ b/hy3dgen/texgen/utils/alignImg4Tex_utils.py @@ -0,0 +1,132 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ + +import torch +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \ + AutoencoderKL + + +class Img2img_Control_Ip_adapter: + def __init__(self, device): + controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16, + variant="fp16", use_safetensors=True) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + 'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True + ) + pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors") + pipe.set_ip_adapter_scale(0.7) + + pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) + # pipe.enable_model_cpu_offload() + self.pipe = pipe.to(device) + + def __call__( + self, + prompt, + control_image, + ip_adapter_image, + negative_prompt, + height=512, + width=512, + num_inference_steps=20, + guidance_scale=8.0, + controlnet_conditioning_scale=1.0, + output_type="pil", + **kwargs, + ): + results = self.pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=control_image, + ip_adapter_image=ip_adapter_image, + generator=torch.manual_seed(42), + seed=42, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + controlnet_conditioning_scale=controlnet_conditioning_scale, + strength=1, + # clip_skip=2, + height=height, + width=width, + output_type=output_type, + **kwargs, + ).images[0] + return results + + +################################################################ + +class HesModel: + def __init__(self, ): + controlnet_depth = ControlNetModel.from_pretrained( + 'diffusers/controlnet-depth-sdxl-1.0', + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True + ) + self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( + 'stabilityai/stable-diffusion-xl-base-1.0', + torch_dtype=torch.float16, + variant="fp16", + controlnet=controlnet_depth, + use_safetensors=True, + ) + self.pipe.vae = AutoencoderKL.from_pretrained( + 'madebyollin/sdxl-vae-fp16-fix', + torch_dtype=torch.float16 + ) + + self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors") + self.pipe.set_ip_adapter_scale(0.7) + self.pipe.to("cuda") + + def __call__(self, + init_image, + control_image, + ip_adapter_image=None, + prompt='3D image', + negative_prompt='2D image', + seed=42, + strength=0.8, + num_inference_steps=40, + guidance_scale=7.5, + controlnet_conditioning_scale=0.5, + **kwargs + ): + image = self.pipe( + prompt=prompt, + image=init_image, + control_image=control_image, + ip_adapter_image=ip_adapter_image, + negative_prompt=negative_prompt, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + strength=strength, + controlnet_conditioning_scale=controlnet_conditioning_scale, + seed=seed, + **kwargs + ).images[0] + return image diff --git a/hy3dgen/texgen/utils/counter_utils.py b/hy3dgen/texgen/utils/counter_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e0374fc327ad2127ec84bb0c267c19a3b9c8d738 --- /dev/null +++ b/hy3dgen/texgen/utils/counter_utils.py @@ -0,0 +1,58 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). 
All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +class RunningStats(): + def __init__(self) -> None: + self.count = 0 + self.sum = 0 + self.mean = 0 + self.min = None + self.max = None + + def add_value(self, value): + self.count += 1 + self.sum += value + self.mean = self.sum / self.count + + if self.min is None or value < self.min: + self.min = value + + if self.max is None or value > self.max: + self.max = value + + def get_count(self): + return self.count + + def get_sum(self): + return self.sum + + def get_mean(self): + return self.mean + + def get_min(self): + return self.min + + def get_max(self): + return self.max diff --git a/hy3dgen/texgen/utils/dehighlight_utils.py b/hy3dgen/texgen/utils/dehighlight_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..089076b08f712ec0db882835f422183fd7f94457 --- /dev/null +++ b/hy3dgen/texgen/utils/dehighlight_utils.py @@ -0,0 +1,84 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler + + +class Light_Shadow_Remover(): + def __init__(self, config): + self.device = config.device + self.cfg_image = 1.5 + self.cfg_text = 1.0 + + pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( + config.light_remover_ckpt_path, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) + pipeline.set_progress_bar_config(disable=True) + + # self.pipeline = pipeline.to(self.device, torch.float16) + self.pipeline = pipeline # Needed to avoid displaying the warning + @torch.no_grad() + def __call__(self, image): + + image = image.resize((512, 512)) + + if image.mode == 'RGBA': + image_array = np.array(image) + alpha_channel = image_array[:, :, 3] + erosion_size = 3 + kernel = np.ones((erosion_size, erosion_size), np.uint8) + alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1) + image_array[alpha_channel == 0, :3] = 255 + image_array[:, :, 3] = alpha_channel + image = Image.fromarray(image_array) + + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = image_tensor[:, :, 3:] + rgb_target = image_tensor[:, :, :3] + else: + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = torch.ones_like(image_tensor)[:, :, :1] + rgb_target = image_tensor[:, :, :3] + + image = image.convert('RGB') + + image = self.pipeline( + prompt="", + image=image, + generator=torch.manual_seed(42), + height=512, + width=512, + num_inference_steps=50, + image_guidance_scale=self.cfg_image, + guidance_scale=self.cfg_text, + ).images[0] + + return image diff --git a/hy3dgen/texgen/utils/multiview_utils.py b/hy3dgen/texgen/utils/multiview_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba5708b617e0d58d6d37025fcb94a75324b9e5a9 --- /dev/null +++ b/hy3dgen/texgen/utils/multiview_utils.py @@ -0,0 +1,86 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import random + +import numpy as np +import torch +from diffusers import DiffusionPipeline +from diffusers import EulerAncestralDiscreteScheduler + + +class Multiview_Diffusion_Net(): + def __init__(self, config) -> None: + self.device = config.device + self.view_size = 512 + multiview_ckpt_path = config.multiview_ckpt_path + + current_file_path = os.path.abspath(__file__) + custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint') + + pipeline = DiffusionPipeline.from_pretrained( + multiview_ckpt_path, + custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16) + + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + + pipeline.set_progress_bar_config(disable=True) + self.pipeline = pipeline #.to(self.device) # only for cosmetics and not display the warning + + def seed_everything(self, seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + def __call__(self, input_image, control_images, camera_info): + + self.seed_everything(0) + + input_image = input_image.resize((self.view_size, self.view_size)) + for i in range(len(control_images)): + control_images[i] = control_images[i].resize((self.view_size, self.view_size)) + if control_images[i].mode == 'L': + control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1') + + kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0)) + + num_view = len(control_images) // 2 + normal_image = [[control_images[i] for i in range(num_view)]] + position_image = [[control_images[i + num_view] for i in range(num_view)]] + + camera_info_gen = [camera_info] + camera_info_ref = [[0]] + kwargs['width'] = self.view_size + kwargs['height'] = self.view_size + kwargs['num_in_batch'] = num_view + kwargs['camera_info_gen'] = camera_info_gen + kwargs['camera_info_ref'] = camera_info_ref + kwargs["normal_imgs"] = normal_image + kwargs["position_imgs"] = position_image + + mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images + return mvd_image diff --git a/hy3dgen/texgen/utils/simplify_mesh_utils.py b/hy3dgen/texgen/utils/simplify_mesh_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..915284d337e648c57fae886dee3333c0203856b6 --- /dev/null +++ b/hy3dgen/texgen/utils/simplify_mesh_utils.py @@ -0,0 +1,46 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
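A hedged sketch of how the `Multiview_Diffusion_Net` wrapper above is driven. The checkpoint path, camera indices and blank control maps are placeholders; in this repo the normal/position maps come from the texture pipeline's renderer, with the first half of `control_images` treated as normal maps and the second half as position maps.

```python
from types import SimpleNamespace

from PIL import Image
from hy3dgen.texgen.utils.multiview_utils import Multiview_Diffusion_Net

config = SimpleNamespace(
    device='cuda',                               # assumed
    multiview_ckpt_path='weights/hunyuanpaint',  # hypothetical local path
)
net = Multiview_Diffusion_Net(config)

input_image = Image.open('assets/shoes.png').convert('RGB')
num_view = 6
normal_maps = [Image.new('RGB', (512, 512)) for _ in range(num_view)]    # placeholder normal maps
position_maps = [Image.new('RGB', (512, 512)) for _ in range(num_view)]  # placeholder position maps
camera_info = [0, 1, 2, 3, 4, 5]                 # assumed per-view camera ids

views = net(input_image, normal_maps + position_maps, camera_info)
print(len(views))                                # one generated view per camera
```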
+# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh + + +def remesh_mesh(mesh_path, remesh_path, method='trimesh'): + if method == 'trimesh': + mesh_simplify_trimesh(mesh_path, remesh_path) + else: + raise NotImplementedError(f'Method {method} has not been implemented.') + + +def mesh_simplify_trimesh(inputpath, outputpath): + import pymeshlab + ms = pymeshlab.MeshSet() + ms.load_new_mesh(inputpath, load_in_a_single_layer=True) + ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False) + + mesh = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh') + face_num = mesh.faces.shape[0] + + if face_num > 100000: + mesh = mesh.simplify_quadric_decimation(40000) + mesh.export(outputpath) diff --git a/hy3dgen/texgen/utils/uv_warp_utils.py b/hy3dgen/texgen/utils/uv_warp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b4f4082274b900aebcdbfcf29a7d6a9532dfa8cb --- /dev/null +++ b/hy3dgen/texgen/utils/uv_warp_utils.py @@ -0,0 +1,42 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations.
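The simplification helper above round-trips the mesh through pymeshlab (which also writes an `.obj` sibling of the output path) and then decimates with trimesh when the face count exceeds 100k. A short sketch with placeholder paths:

```python
from hy3dgen.texgen.utils.simplify_mesh_utils import remesh_mesh

# Placeholder paths; an intermediate 'simplified_mesh.obj' is written next to the output.
remesh_mesh('textured_mesh.glb', 'simplified_mesh.glb')   # method defaults to 'trimesh'

# Any other method name raises NotImplementedError.
try:
    remesh_mesh('textured_mesh.glb', 'simplified_mesh.glb', method='open3d')
except NotImplementedError as err:
    print(err)
```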
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh +import xatlas + + +def mesh_uv_wrap(mesh): + if isinstance(mesh, trimesh.Scene): + mesh = mesh.dump(concatenate=True) + + # if len(mesh.faces) > 50000: + # raise ValueError("The mesh has more than 50,000 faces, which is not supported.") + + vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces) + + mesh.vertices = mesh.vertices[vmapping] + mesh.faces = indices + mesh.visual.uv = uvs + + return mesh diff --git a/hy3dgen/text2image.py b/hy3dgen/text2image.py new file mode 100644 index 0000000000000000000000000000000000000000..be920672cb72238cbe49cba930e3e02a7c287b82 --- /dev/null +++ b/hy3dgen/text2image.py @@ -0,0 +1,93 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
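`mesh_uv_wrap` above flattens a trimesh `Scene` to a single mesh if needed and generates a UV atlas with xatlas, re-indexing vertices and faces in the process. A minimal sketch with a placeholder input path, assuming the loaded mesh carries texture visuals so the UVs can be attached:

```python
import trimesh

from hy3dgen.texgen.utils.uv_warp_utils import mesh_uv_wrap

mesh = trimesh.load('simplified_mesh.glb')   # Scene or single Trimesh both accepted
mesh = mesh_uv_wrap(mesh)                    # vertices/faces re-indexed by xatlas
print(mesh.visual.uv.shape)                  # (n_vertices, 2) UV coordinates
mesh.export('unwrapped_mesh.obj')
```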
+ + +import os +import random + +import numpy as np +import torch +from diffusers import AutoPipelineForText2Image + + +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + +class HunyuanDiTPipeline: + def __init__( + self, + model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled", + device='cpu' + ): + torch.set_default_device('cpu') + self.device = device + self.pipe = AutoPipelineForText2Image.from_pretrained( + model_path, + torch_dtype=torch.float16, + enable_pag=True, + pag_applied_layers=["blocks.(16|17|18|19)"] + ) # .to(device) # needed to avoid displaying the warning + self.pos_txt = ",白色背景,3D风格,最佳质量" + self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \ + "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \ + "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \ + "额外的手臂,额外的腿,融合的手指,手指太多,长脖子" + + def compile(self): + # accelarate hunyuan-dit transformer,first inference will cost long time + torch.set_float32_matmul_precision('high') + self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True) + # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True) + generator = torch.Generator(device=self.pipe.device) # infer once for hot-start + out_img = self.pipe( + prompt='美少女战士', + negative_prompt='模糊', + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + + @torch.no_grad() + def __call__(self, prompt, seed=0): + seed_everything(seed) + generator = torch.Generator(device="cuda") #self.pipe.device + generator = generator.manual_seed(int(seed)) + out_img = self.pipe( + prompt=self.pos_txt+prompt, + negative_prompt=self.neg_txt, + num_inference_steps=20, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + return out_img diff --git a/requirements-uv.txt b/requirements-uv.txt new file mode 100644 index 0000000000000000000000000000000000000000..de150d1a6e83d597b348d8d349bd2bc3affe5ac7 --- /dev/null +++ b/requirements-uv.txt @@ -0,0 +1,393 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.txt -o requirements-uv.txt --index-strategy unsafe-best-match --no-build-isolation -p 3.10 +accelerate==1.3.0 + # via + # -r requirements.txt + # mmgp + # peft +aiofiles==23.2.1 + # via gradio +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.8.0 + # via + # gradio + # httpx + # starlette +attrs==24.3.0 + # via + # jsonschema + # referencing +certifi==2024.12.14 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via + # typer + # uvicorn +colorama==0.4.6 + # via + # click + # tqdm +coloredlogs==15.0.1 + # via onnxruntime +contourpy==1.3.1 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via pygltflib +deprecated==1.2.15 + # via pygltflib +diffusers==0.32.2 + # via -r requirements.txt +einops==0.8.0 + # via -r requirements.txt +exceptiongroup==1.2.2 + # via anyio +fastapi==0.115.6 + # via + # -r requirements.txt + # gradio +ffmpy==0.5.0 + # via gradio +filelock==3.16.1 + # via + # diffusers + # huggingface-hub + # torch + # transformers +flatbuffers==24.12.23 + # via onnxruntime +fonttools==4.55.3 + # via matplotlib +fsspec==2024.12.0 + # via + # gradio-client + # huggingface-hub + # torch +gradio==4.44.1 + # via + # -r requirements.txt + # gradio-litmodel3d 
+gradio-client==1.3.0 + # via gradio +gradio-litmodel3d==0.0.1 + # via -r requirements.txt +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.7 + # via httpx +httpx==0.28.1 + # via + # gradio + # gradio-client +huggingface-hub==0.27.1 + # via + # accelerate + # diffusers + # gradio + # gradio-client + # optimum-quanto + # peft + # tokenizers + # transformers +humanfriendly==10.0 + # via coloredlogs +idna==3.10 + # via + # anyio + # httpx + # requests +imageio==2.37.0 + # via scikit-image +importlib-metadata==8.6.1 + # via diffusers +importlib-resources==6.5.2 + # via gradio +jinja2==3.1.5 + # via + # gradio + # torch +jsonschema==4.23.0 + # via rembg +jsonschema-specifications==2024.10.1 + # via jsonschema +kiwisolver==1.4.8 + # via matplotlib +lazy-loader==0.4 + # via scikit-image +llvmlite==0.44.0 + # via numba +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via + # gradio + # jinja2 +marshmallow==3.25.1 + # via dataclasses-json +matplotlib==3.10.0 + # via gradio +mdurl==0.1.2 + # via markdown-it-py +mmgp==3.1.3 + # via -r requirements.txt +mpmath==1.3.0 + # via sympy +msvc-runtime==14.42.34433 + # via pymeshlab +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.4.2 + # via + # scikit-image + # torch +ninja==1.11.1.3 + # via + # -r requirements.txt + # optimum-quanto +numba==0.61.0 + # via pymatting +numpy==2.1.3 + # via + # -r requirements.txt + # accelerate + # contourpy + # diffusers + # gradio + # imageio + # matplotlib + # numba + # onnxruntime + # opencv-python + # opencv-python-headless + # optimum-quanto + # pandas + # peft + # pymatting + # pymeshlab + # rembg + # scikit-image + # scipy + # tifffile + # torchvision + # transformers + # trimesh + # xatlas +omegaconf==2.3.0 + # via -r requirements.txt +onnxruntime==1.20.1 + # via -r requirements.txt +opencv-python==4.11.0.86 + # via -r requirements.txt +opencv-python-headless==4.11.0.86 + # via rembg +optimum-quanto==0.2.6 + # via mmgp +orjson==3.10.15 + # via gradio +packaging==24.2 + # via + # accelerate + # gradio + # gradio-client + # huggingface-hub + # lazy-loader + # marshmallow + # matplotlib + # onnxruntime + # peft + # pooch + # scikit-image + # transformers +pandas==2.2.3 + # via gradio +peft==0.14.0 + # via mmgp +pillow==10.4.0 + # via + # diffusers + # gradio + # imageio + # matplotlib + # pymatting + # rembg + # scikit-image + # torchvision +platformdirs==4.3.6 + # via pooch +pooch==1.8.2 + # via rembg +protobuf==5.29.3 + # via onnxruntime +psutil==6.1.1 + # via + # accelerate + # mmgp + # peft +pybind11==2.13.6 + # via -r requirements.txt +pydantic==2.10.5 + # via + # fastapi + # gradio +pydantic-core==2.27.2 + # via pydantic +pydub==0.25.1 + # via gradio +pygltflib==1.16.3 + # via -r requirements.txt +pygments==2.19.1 + # via rich +pymatting==1.1.13 + # via rembg +pymeshlab==2023.12.post2 + # via -r requirements.txt +pyparsing==3.2.1 + # via matplotlib +pyreadline3==3.5.4 + # via humanfriendly +python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas +python-multipart==0.0.20 + # via gradio +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # gradio + # huggingface-hub + # omegaconf + # peft + # transformers +referencing==0.36.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.11.6 + # via + # diffusers + # transformers +rembg==2.0.61 + # via -r requirements.txt +requests==2.32.3 + # via + # diffusers + # huggingface-hub + # pooch + # transformers +rich==13.9.4 + # via typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +ruff==0.9.2 + 
# via gradio +safetensors==0.5.2 + # via + # accelerate + # diffusers + # mmgp + # optimum-quanto + # peft + # transformers +scikit-image==0.25.0 + # via rembg +scipy==1.15.1 + # via + # pymatting + # rembg + # scikit-image +semantic-version==2.10.0 + # via gradio +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sniffio==1.3.1 + # via anyio +starlette==0.41.3 + # via fastapi +sympy==1.13.1 + # via + # onnxruntime + # torch +tifffile==2025.1.10 + # via scikit-image +tokenizers==0.21.0 + # via transformers +tomlkit==0.12.0 + # via gradio +torch==2.6.0 + # via + # -r requirements.txt + # accelerate + # mmgp + # optimum-quanto + # peft + # torchvision +torchvision==0.21.0 + # via -r requirements.txt +tqdm==4.67.1 + # via + # -r requirements.txt + # huggingface-hub + # peft + # rembg + # transformers +transformers==4.48.1 + # via + # -r requirements.txt + # peft +trimesh==4.5.3 + # via -r requirements.txt +typer==0.15.1 + # via gradio +typing-extensions==4.12.2 + # via + # anyio + # fastapi + # gradio + # gradio-client + # huggingface-hub + # pydantic + # pydantic-core + # referencing + # rich + # torch + # typer + # typing-inspect + # uvicorn +typing-inspect==0.9.0 + # via dataclasses-json +tzdata==2024.2 + # via pandas +urllib3==2.3.0 + # via + # gradio + # requests +uvicorn==0.34.0 + # via + # -r requirements.txt + # gradio +websockets==12.0 + # via gradio-client +wrapt==1.17.2 + # via deprecated +xatlas==0.0.9 + # via -r requirements.txt +zipp==3.21.0 + # via importlib-metadata diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f9b478720d140e553e5740a4e0074e2eb252678 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,43 @@ +gradio_litmodel3d +ninja +pybind11 + +diffusers +einops +opencv-python +numpy +torch +transformers +torchvision +#taming-transformers-rom1504 +#ConfigArgParse +#ipdb +omegaconf + +#sentencepiece +tqdm + +# Mesh Processing +trimesh +pymeshlab +pygltflib +xatlas +#kornia +#facexlib + +# Training +accelerate +#pytorch_lightning +#scikit-learn +#scikit-image + +# Demo only +gradio +uvicorn +fastapi +uvicorn +rembg +onnxruntime +#gevent +#geventhttpclient +mmgp==3.1.3 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9b8d028efcd2c63b49ee388e221fa380ba1787 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from setuptools import setup, find_packages + +setup( + name="hy3dgen", + version="2.0.0", + packages=find_packages(), + +)
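To round out the files added above, a hedged sketch of driving the `HunyuanDiTPipeline` wrapper from `hy3dgen/text2image.py`. The wrapper prefixes the user prompt with a built-in Chinese positive prompt (roughly "white background, 3D style, best quality") and pairs it with a Chinese negative prompt listing common quality defects; it builds its generator on CUDA, so a GPU is assumed at call time. Prompt and output path below are placeholders.

```python
from hy3dgen.text2image import HunyuanDiTPipeline

t2i = HunyuanDiTPipeline()        # defaults to the distilled HunyuanDiT v1.1 checkpoint
# t2i.compile()                   # optional torch.compile warm-up; the first call is slow

image = t2i('a pair of white sneakers', seed=0)   # 1024x1024 PIL image
image.save('t2i_sneakers.png')
```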