Spaces:
Runtime error
Runtime error
syedMohib44
commited on
Commit
·
00e5927
1
Parent(s):
f4ae690
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -35
- .gitignore +28 -0
- README.md +15 -5
- assets/shoes.png +0 -0
- build/lib/hy3dgen/__init__.py +23 -0
- build/lib/hy3dgen/rembg.py +36 -0
- build/lib/hy3dgen/shapegen/__init__.py +27 -0
- build/lib/hy3dgen/shapegen/models/__init__.py +28 -0
- build/lib/hy3dgen/shapegen/models/conditioner.py +165 -0
- build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py +390 -0
- build/lib/hy3dgen/shapegen/models/vae.py +636 -0
- build/lib/hy3dgen/shapegen/pipelines.py +589 -0
- build/lib/hy3dgen/shapegen/postprocessors.py +175 -0
- build/lib/hy3dgen/shapegen/preprocessors.py +127 -0
- build/lib/hy3dgen/shapegen/schedulers.py +307 -0
- build/lib/hy3dgen/texgen/__init__.py +26 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py +23 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py +116 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py +70 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py +833 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py +44 -0
- build/lib/hy3dgen/texgen/differentiable_renderer/setup.py +48 -0
- build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py +23 -0
- build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py +554 -0
- build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py +23 -0
- build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py +440 -0
- build/lib/hy3dgen/texgen/pipelines.py +227 -0
- build/lib/hy3dgen/texgen/utils/__init__.py +23 -0
- build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py +132 -0
- build/lib/hy3dgen/texgen/utils/counter_utils.py +58 -0
- build/lib/hy3dgen/texgen/utils/dehighlight_utils.py +84 -0
- build/lib/hy3dgen/texgen/utils/multiview_utils.py +86 -0
- build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py +46 -0
- build/lib/hy3dgen/texgen/utils/uv_warp_utils.py +42 -0
- build/lib/hy3dgen/text2image.py +93 -0
- dist/hy3dgen-2.0.0-py3.12.egg +0 -0
- hy3dgen.egg-info/PKG-INFO +3 -0
- hy3dgen.egg-info/SOURCES.txt +37 -0
- hy3dgen.egg-info/dependency_links.txt +1 -0
- hy3dgen.egg-info/top_level.txt +1 -0
- hy3dgen/__init__.py +23 -0
- hy3dgen/rembg.py +36 -0
- hy3dgen/shapegen/__init__.py +27 -0
- hy3dgen/shapegen/models/__init__.py +28 -0
- hy3dgen/shapegen/models/conditioner.py +165 -0
- hy3dgen/shapegen/models/hunyuan3ddit.py +390 -0
- hy3dgen/shapegen/models/vae.py +636 -0
- hy3dgen/shapegen/pipelines.py +589 -0
- hy3dgen/shapegen/postprocessors.py +175 -0
- hy3dgen/shapegen/preprocessors.py +127 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.obj filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*.so
|
5 |
+
|
6 |
+
# Virtual environment
|
7 |
+
venv/
|
8 |
+
env/
|
9 |
+
.venv/
|
10 |
+
|
11 |
+
# Jupyter Notebook checkpoints
|
12 |
+
.ipynb_checkpoints/
|
13 |
+
|
14 |
+
# Logs and local environment files
|
15 |
+
*.log
|
16 |
+
*.env
|
17 |
+
.env.local
|
18 |
+
|
19 |
+
# PyTorch or TensorFlow saved models
|
20 |
+
*.pt
|
21 |
+
*.pth
|
22 |
+
*.h5
|
23 |
+
|
24 |
+
# VSCode settings (if using VSCode)
|
25 |
+
.vscode/
|
26 |
+
|
27 |
+
# Hugging Face cache (optional)
|
28 |
+
~/.cache/huggingface/
|
README.md
CHANGED
@@ -1,14 +1,24 @@
|
|
1 |
---
|
2 |
-
title: Ditto
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.17.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
-
short_description:
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: Ditto
|
3 |
+
emoji: 🐢
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.17.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
+
short_description: Image to 3D object generator
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
15 |
+
|
16 |
+
# Setup
|
17 |
+
|
18 |
+
```
|
19 |
+
uv pip compile requirements.txt -o requirements-uv.txt --index-strategy unsafe-best-match --no-build-isolation -p 3.10
|
20 |
+
|
21 |
+
pip install -r requirements.txt
|
22 |
+
|
23 |
+
python setup.py install
|
24 |
+
```
|
assets/shoes.png
ADDED
![]() |
build/lib/hy3dgen/__init__.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
build/lib/hy3dgen/rembg.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
|
26 |
+
from PIL import Image
|
27 |
+
from rembg import remove, new_session
|
28 |
+
|
29 |
+
|
30 |
+
class BackgroundRemover():
|
31 |
+
def __init__(self):
|
32 |
+
self.session = new_session()
|
33 |
+
|
34 |
+
def __call__(self, image: Image.Image):
|
35 |
+
output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
|
36 |
+
return output
|
build/lib/hy3dgen/shapegen/__init__.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
|
26 |
+
from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
|
27 |
+
from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
|
build/lib/hy3dgen/shapegen/models/__init__.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
|
26 |
+
from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
|
27 |
+
from .hunyuan3ddit import Hunyuan3DDiT
|
28 |
+
from .vae import ShapeVAE
|
build/lib/hy3dgen/shapegen/models/conditioner.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
import torch
|
26 |
+
import torch.nn as nn
|
27 |
+
from torchvision import transforms
|
28 |
+
from transformers import (
|
29 |
+
CLIPVisionModelWithProjection,
|
30 |
+
CLIPVisionConfig,
|
31 |
+
Dinov2Model,
|
32 |
+
Dinov2Config,
|
33 |
+
)
|
34 |
+
|
35 |
+
|
36 |
+
class ImageEncoder(nn.Module):
|
37 |
+
def __init__(
|
38 |
+
self,
|
39 |
+
version=None,
|
40 |
+
config=None,
|
41 |
+
use_cls_token=True,
|
42 |
+
image_size=224,
|
43 |
+
**kwargs,
|
44 |
+
):
|
45 |
+
super().__init__()
|
46 |
+
|
47 |
+
if config is None:
|
48 |
+
self.model = self.MODEL_CLASS.from_pretrained(version)
|
49 |
+
else:
|
50 |
+
self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
|
51 |
+
self.model.eval()
|
52 |
+
self.model.requires_grad_(False)
|
53 |
+
self.use_cls_token = use_cls_token
|
54 |
+
self.size = image_size // 14
|
55 |
+
self.num_patches = (image_size // 14) ** 2
|
56 |
+
if self.use_cls_token:
|
57 |
+
self.num_patches += 1
|
58 |
+
|
59 |
+
self.transform = transforms.Compose(
|
60 |
+
[
|
61 |
+
transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
|
62 |
+
transforms.CenterCrop(image_size),
|
63 |
+
transforms.Normalize(
|
64 |
+
mean=self.mean,
|
65 |
+
std=self.std,
|
66 |
+
),
|
67 |
+
]
|
68 |
+
)
|
69 |
+
|
70 |
+
def forward(self, image, mask=None, value_range=(-1, 1)):
|
71 |
+
if value_range is not None:
|
72 |
+
low, high = value_range
|
73 |
+
image = (image - low) / (high - low)
|
74 |
+
|
75 |
+
image = image.to(self.model.device, dtype=self.model.dtype)
|
76 |
+
inputs = self.transform(image)
|
77 |
+
outputs = self.model(inputs)
|
78 |
+
|
79 |
+
last_hidden_state = outputs.last_hidden_state
|
80 |
+
if not self.use_cls_token:
|
81 |
+
last_hidden_state = last_hidden_state[:, 1:, :]
|
82 |
+
|
83 |
+
return last_hidden_state
|
84 |
+
|
85 |
+
def unconditional_embedding(self, batch_size):
|
86 |
+
device = next(self.model.parameters()).device
|
87 |
+
dtype = next(self.model.parameters()).dtype
|
88 |
+
zero = torch.zeros(
|
89 |
+
batch_size,
|
90 |
+
self.num_patches,
|
91 |
+
self.model.config.hidden_size,
|
92 |
+
device=device,
|
93 |
+
dtype=dtype,
|
94 |
+
)
|
95 |
+
|
96 |
+
return zero
|
97 |
+
|
98 |
+
|
99 |
+
class CLIPImageEncoder(ImageEncoder):
|
100 |
+
MODEL_CLASS = CLIPVisionModelWithProjection
|
101 |
+
MODEL_CONFIG_CLASS = CLIPVisionConfig
|
102 |
+
mean = [0.48145466, 0.4578275, 0.40821073]
|
103 |
+
std = [0.26862954, 0.26130258, 0.27577711]
|
104 |
+
|
105 |
+
|
106 |
+
class DinoImageEncoder(ImageEncoder):
|
107 |
+
MODEL_CLASS = Dinov2Model
|
108 |
+
MODEL_CONFIG_CLASS = Dinov2Config
|
109 |
+
mean = [0.485, 0.456, 0.406]
|
110 |
+
std = [0.229, 0.224, 0.225]
|
111 |
+
|
112 |
+
|
113 |
+
def build_image_encoder(config):
|
114 |
+
if config['type'] == 'CLIPImageEncoder':
|
115 |
+
return CLIPImageEncoder(**config['kwargs'])
|
116 |
+
elif config['type'] == 'DinoImageEncoder':
|
117 |
+
return DinoImageEncoder(**config['kwargs'])
|
118 |
+
else:
|
119 |
+
raise ValueError(f'Unknown image encoder type: {config["type"]}')
|
120 |
+
|
121 |
+
|
122 |
+
class DualImageEncoder(nn.Module):
|
123 |
+
def __init__(
|
124 |
+
self,
|
125 |
+
main_image_encoder,
|
126 |
+
additional_image_encoder,
|
127 |
+
):
|
128 |
+
super().__init__()
|
129 |
+
self.main_image_encoder = build_image_encoder(main_image_encoder)
|
130 |
+
self.additional_image_encoder = build_image_encoder(additional_image_encoder)
|
131 |
+
|
132 |
+
def forward(self, image, mask=None):
|
133 |
+
outputs = {
|
134 |
+
'main': self.main_image_encoder(image, mask=mask),
|
135 |
+
'additional': self.additional_image_encoder(image, mask=mask),
|
136 |
+
}
|
137 |
+
return outputs
|
138 |
+
|
139 |
+
def unconditional_embedding(self, batch_size):
|
140 |
+
outputs = {
|
141 |
+
'main': self.main_image_encoder.unconditional_embedding(batch_size),
|
142 |
+
'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
|
143 |
+
}
|
144 |
+
return outputs
|
145 |
+
|
146 |
+
|
147 |
+
class SingleImageEncoder(nn.Module):
|
148 |
+
def __init__(
|
149 |
+
self,
|
150 |
+
main_image_encoder,
|
151 |
+
):
|
152 |
+
super().__init__()
|
153 |
+
self.main_image_encoder = build_image_encoder(main_image_encoder)
|
154 |
+
|
155 |
+
def forward(self, image, mask=None):
|
156 |
+
outputs = {
|
157 |
+
'main': self.main_image_encoder(image, mask=mask),
|
158 |
+
}
|
159 |
+
return outputs
|
160 |
+
|
161 |
+
def unconditional_embedding(self, batch_size):
|
162 |
+
outputs = {
|
163 |
+
'main': self.main_image_encoder.unconditional_embedding(batch_size),
|
164 |
+
}
|
165 |
+
return outputs
|
build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py
ADDED
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
import math
|
26 |
+
from dataclasses import dataclass
|
27 |
+
from typing import List, Tuple, Optional
|
28 |
+
|
29 |
+
import torch
|
30 |
+
from einops import rearrange
|
31 |
+
from torch import Tensor, nn
|
32 |
+
|
33 |
+
|
34 |
+
def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
|
35 |
+
x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
36 |
+
x = rearrange(x, "B H L D -> B L (H D)")
|
37 |
+
return x
|
38 |
+
|
39 |
+
|
40 |
+
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
|
41 |
+
"""
|
42 |
+
Create sinusoidal timestep embeddings.
|
43 |
+
:param t: a 1-D Tensor of N indices, one per batch element.
|
44 |
+
These may be fractional.
|
45 |
+
:param dim: the dimension of the output.
|
46 |
+
:param max_period: controls the minimum frequency of the embeddings.
|
47 |
+
:return: an (N, D) Tensor of positional embeddings.
|
48 |
+
"""
|
49 |
+
t = time_factor * t
|
50 |
+
half = dim // 2
|
51 |
+
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
|
52 |
+
t.device
|
53 |
+
)
|
54 |
+
|
55 |
+
args = t[:, None].float() * freqs[None]
|
56 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
57 |
+
if dim % 2:
|
58 |
+
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
59 |
+
if torch.is_floating_point(t):
|
60 |
+
embedding = embedding.to(t)
|
61 |
+
return embedding
|
62 |
+
|
63 |
+
|
64 |
+
class MLPEmbedder(nn.Module):
|
65 |
+
def __init__(self, in_dim: int, hidden_dim: int):
|
66 |
+
super().__init__()
|
67 |
+
self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
|
68 |
+
self.silu = nn.SiLU()
|
69 |
+
self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
|
70 |
+
|
71 |
+
def forward(self, x: Tensor) -> Tensor:
|
72 |
+
return self.out_layer(self.silu(self.in_layer(x)))
|
73 |
+
|
74 |
+
|
75 |
+
class RMSNorm(torch.nn.Module):
|
76 |
+
def __init__(self, dim: int):
|
77 |
+
super().__init__()
|
78 |
+
self.scale = nn.Parameter(torch.ones(dim))
|
79 |
+
|
80 |
+
def forward(self, x: Tensor):
|
81 |
+
x_dtype = x.dtype
|
82 |
+
x = x.float()
|
83 |
+
rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6)
|
84 |
+
return (x * rrms).to(dtype=x_dtype) * self.scale
|
85 |
+
|
86 |
+
|
87 |
+
class QKNorm(torch.nn.Module):
|
88 |
+
def __init__(self, dim: int):
|
89 |
+
super().__init__()
|
90 |
+
self.query_norm = RMSNorm(dim)
|
91 |
+
self.key_norm = RMSNorm(dim)
|
92 |
+
|
93 |
+
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]:
|
94 |
+
q = self.query_norm(q)
|
95 |
+
k = self.key_norm(k)
|
96 |
+
return q.to(v), k.to(v)
|
97 |
+
|
98 |
+
|
99 |
+
class SelfAttention(nn.Module):
|
100 |
+
def __init__(
|
101 |
+
self,
|
102 |
+
dim: int,
|
103 |
+
num_heads: int = 8,
|
104 |
+
qkv_bias: bool = False,
|
105 |
+
):
|
106 |
+
super().__init__()
|
107 |
+
self.num_heads = num_heads
|
108 |
+
head_dim = dim // num_heads
|
109 |
+
|
110 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
111 |
+
self.norm = QKNorm(head_dim)
|
112 |
+
self.proj = nn.Linear(dim, dim)
|
113 |
+
|
114 |
+
def forward(self, x: Tensor, pe: Tensor) -> Tensor:
|
115 |
+
qkv = self.qkv(x)
|
116 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
117 |
+
q, k = self.norm(q, k, v)
|
118 |
+
x = attention(q, k, v, pe=pe)
|
119 |
+
x = self.proj(x)
|
120 |
+
return x
|
121 |
+
|
122 |
+
|
123 |
+
@dataclass
|
124 |
+
class ModulationOut:
|
125 |
+
shift: Tensor
|
126 |
+
scale: Tensor
|
127 |
+
gate: Tensor
|
128 |
+
|
129 |
+
|
130 |
+
class Modulation(nn.Module):
|
131 |
+
def __init__(self, dim: int, double: bool):
|
132 |
+
super().__init__()
|
133 |
+
self.is_double = double
|
134 |
+
self.multiplier = 6 if double else 3
|
135 |
+
self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
|
136 |
+
|
137 |
+
def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
|
138 |
+
out = self.lin(nn.functional.silu(vec))[:, None, :]
|
139 |
+
out = out.chunk(self.multiplier, dim=-1)
|
140 |
+
|
141 |
+
return (
|
142 |
+
ModulationOut(*out[:3]),
|
143 |
+
ModulationOut(*out[3:]) if self.is_double else None,
|
144 |
+
)
|
145 |
+
|
146 |
+
|
147 |
+
class DoubleStreamBlock(nn.Module):
|
148 |
+
def __init__(
|
149 |
+
self,
|
150 |
+
hidden_size: int,
|
151 |
+
num_heads: int,
|
152 |
+
mlp_ratio: float,
|
153 |
+
qkv_bias: bool = False,
|
154 |
+
):
|
155 |
+
super().__init__()
|
156 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
157 |
+
self.num_heads = num_heads
|
158 |
+
self.hidden_size = hidden_size
|
159 |
+
self.img_mod = Modulation(hidden_size, double=True)
|
160 |
+
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
161 |
+
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
162 |
+
|
163 |
+
self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
164 |
+
self.img_mlp = nn.Sequential(
|
165 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
166 |
+
nn.GELU(approximate="tanh"),
|
167 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
168 |
+
)
|
169 |
+
|
170 |
+
self.txt_mod = Modulation(hidden_size, double=True)
|
171 |
+
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
172 |
+
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
173 |
+
|
174 |
+
self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
175 |
+
self.txt_mlp = nn.Sequential(
|
176 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
177 |
+
nn.GELU(approximate="tanh"),
|
178 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
179 |
+
)
|
180 |
+
|
181 |
+
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]:
|
182 |
+
img_mod1, img_mod2 = self.img_mod(vec)
|
183 |
+
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
184 |
+
|
185 |
+
img_modulated = self.img_norm1(img)
|
186 |
+
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
|
187 |
+
img_qkv = self.img_attn.qkv(img_modulated)
|
188 |
+
img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
189 |
+
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
|
190 |
+
|
191 |
+
txt_modulated = self.txt_norm1(txt)
|
192 |
+
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
|
193 |
+
txt_qkv = self.txt_attn.qkv(txt_modulated)
|
194 |
+
txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
195 |
+
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
|
196 |
+
|
197 |
+
q = torch.cat((txt_q, img_q), dim=2)
|
198 |
+
k = torch.cat((txt_k, img_k), dim=2)
|
199 |
+
v = torch.cat((txt_v, img_v), dim=2)
|
200 |
+
|
201 |
+
attn = attention(q, k, v, pe=pe)
|
202 |
+
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
|
203 |
+
|
204 |
+
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
|
205 |
+
img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
|
206 |
+
|
207 |
+
txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
|
208 |
+
txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
|
209 |
+
return img, txt
|
210 |
+
|
211 |
+
|
212 |
+
class SingleStreamBlock(nn.Module):
|
213 |
+
"""
|
214 |
+
A DiT block with parallel linear layers as described in
|
215 |
+
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
|
216 |
+
"""
|
217 |
+
|
218 |
+
def __init__(
|
219 |
+
self,
|
220 |
+
hidden_size: int,
|
221 |
+
num_heads: int,
|
222 |
+
mlp_ratio: float = 4.0,
|
223 |
+
qk_scale: Optional[float] = None,
|
224 |
+
):
|
225 |
+
super().__init__()
|
226 |
+
|
227 |
+
self.hidden_dim = hidden_size
|
228 |
+
self.num_heads = num_heads
|
229 |
+
head_dim = hidden_size // num_heads
|
230 |
+
self.scale = qk_scale or head_dim ** -0.5
|
231 |
+
|
232 |
+
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
233 |
+
# qkv and mlp_in
|
234 |
+
self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
|
235 |
+
# proj and mlp_out
|
236 |
+
self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
|
237 |
+
|
238 |
+
self.norm = QKNorm(head_dim)
|
239 |
+
|
240 |
+
self.hidden_size = hidden_size
|
241 |
+
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
242 |
+
|
243 |
+
self.mlp_act = nn.GELU(approximate="tanh")
|
244 |
+
self.modulation = Modulation(hidden_size, double=False)
|
245 |
+
|
246 |
+
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
|
247 |
+
mod, _ = self.modulation(vec)
|
248 |
+
|
249 |
+
x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
|
250 |
+
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
|
251 |
+
|
252 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
253 |
+
q, k = self.norm(q, k, v)
|
254 |
+
|
255 |
+
# compute attention
|
256 |
+
attn = attention(q, k, v, pe=pe)
|
257 |
+
# compute activation in mlp stream, cat again and run second linear layer
|
258 |
+
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
|
259 |
+
return x + mod.gate * output
|
260 |
+
|
261 |
+
|
262 |
+
class LastLayer(nn.Module):
|
263 |
+
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
|
264 |
+
super().__init__()
|
265 |
+
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
266 |
+
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
|
267 |
+
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
268 |
+
|
269 |
+
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
|
270 |
+
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
271 |
+
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
272 |
+
x = self.linear(x)
|
273 |
+
return x
|
274 |
+
|
275 |
+
|
276 |
+
class Hunyuan3DDiT(nn.Module):
|
277 |
+
def __init__(
|
278 |
+
self,
|
279 |
+
in_channels: int = 64,
|
280 |
+
context_in_dim: int = 1536,
|
281 |
+
hidden_size: int = 1024,
|
282 |
+
mlp_ratio: float = 4.0,
|
283 |
+
num_heads: int = 16,
|
284 |
+
depth: int = 16,
|
285 |
+
depth_single_blocks: int = 32,
|
286 |
+
axes_dim: List[int] = [64],
|
287 |
+
theta: int = 10_000,
|
288 |
+
qkv_bias: bool = True,
|
289 |
+
time_factor: float = 1000,
|
290 |
+
ckpt_path: Optional[str] = None,
|
291 |
+
**kwargs,
|
292 |
+
):
|
293 |
+
super().__init__()
|
294 |
+
self.in_channels = in_channels
|
295 |
+
self.context_in_dim = context_in_dim
|
296 |
+
self.hidden_size = hidden_size
|
297 |
+
self.mlp_ratio = mlp_ratio
|
298 |
+
self.num_heads = num_heads
|
299 |
+
self.depth = depth
|
300 |
+
self.depth_single_blocks = depth_single_blocks
|
301 |
+
self.axes_dim = axes_dim
|
302 |
+
self.theta = theta
|
303 |
+
self.qkv_bias = qkv_bias
|
304 |
+
self.time_factor = time_factor
|
305 |
+
self.out_channels = self.in_channels
|
306 |
+
|
307 |
+
if hidden_size % num_heads != 0:
|
308 |
+
raise ValueError(
|
309 |
+
f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
|
310 |
+
)
|
311 |
+
pe_dim = hidden_size // num_heads
|
312 |
+
if sum(axes_dim) != pe_dim:
|
313 |
+
raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
|
314 |
+
self.hidden_size = hidden_size
|
315 |
+
self.num_heads = num_heads
|
316 |
+
self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
317 |
+
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
318 |
+
self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
|
319 |
+
|
320 |
+
self.double_blocks = nn.ModuleList(
|
321 |
+
[
|
322 |
+
DoubleStreamBlock(
|
323 |
+
self.hidden_size,
|
324 |
+
self.num_heads,
|
325 |
+
mlp_ratio=mlp_ratio,
|
326 |
+
qkv_bias=qkv_bias,
|
327 |
+
)
|
328 |
+
for _ in range(depth)
|
329 |
+
]
|
330 |
+
)
|
331 |
+
|
332 |
+
self.single_blocks = nn.ModuleList(
|
333 |
+
[
|
334 |
+
SingleStreamBlock(
|
335 |
+
self.hidden_size,
|
336 |
+
self.num_heads,
|
337 |
+
mlp_ratio=mlp_ratio,
|
338 |
+
)
|
339 |
+
for _ in range(depth_single_blocks)
|
340 |
+
]
|
341 |
+
)
|
342 |
+
|
343 |
+
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
|
344 |
+
|
345 |
+
if ckpt_path is not None:
|
346 |
+
print('restored denoiser ckpt', ckpt_path)
|
347 |
+
|
348 |
+
ckpt = torch.load(ckpt_path, map_location="cpu")
|
349 |
+
if 'state_dict' not in ckpt:
|
350 |
+
# deepspeed ckpt
|
351 |
+
state_dict = {}
|
352 |
+
for k in ckpt.keys():
|
353 |
+
new_k = k.replace('_forward_module.', '')
|
354 |
+
state_dict[new_k] = ckpt[k]
|
355 |
+
else:
|
356 |
+
state_dict = ckpt["state_dict"]
|
357 |
+
|
358 |
+
final_state_dict = {}
|
359 |
+
for k, v in state_dict.items():
|
360 |
+
if k.startswith('model.'):
|
361 |
+
final_state_dict[k.replace('model.', '')] = v
|
362 |
+
else:
|
363 |
+
final_state_dict[k] = v
|
364 |
+
missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
|
365 |
+
print('unexpected keys:', unexpected)
|
366 |
+
print('missing keys:', missing)
|
367 |
+
|
368 |
+
def forward(
|
369 |
+
self,
|
370 |
+
x,
|
371 |
+
t,
|
372 |
+
contexts,
|
373 |
+
**kwargs,
|
374 |
+
) -> Tensor:
|
375 |
+
cond = contexts['main']
|
376 |
+
latent = self.latent_in(x)
|
377 |
+
vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
|
378 |
+
cond = self.cond_in(cond)
|
379 |
+
pe = None
|
380 |
+
|
381 |
+
for block in self.double_blocks:
|
382 |
+
latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
|
383 |
+
|
384 |
+
latent = torch.cat((cond, latent), 1)
|
385 |
+
for block in self.single_blocks:
|
386 |
+
latent = block(latent, vec=vec, pe=pe)
|
387 |
+
|
388 |
+
latent = latent[:, cond.shape[1]:, ...]
|
389 |
+
latent = self.final_layer(latent, vec)
|
390 |
+
return latent
|
build/lib/hy3dgen/shapegen/models/vae.py
ADDED
@@ -0,0 +1,636 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
from typing import Tuple, List, Union, Optional
|
26 |
+
|
27 |
+
import numpy as np
|
28 |
+
import torch
|
29 |
+
import torch.nn as nn
|
30 |
+
import torch.nn.functional as F
|
31 |
+
from einops import rearrange, repeat
|
32 |
+
from skimage import measure
|
33 |
+
from tqdm import tqdm
|
34 |
+
|
35 |
+
|
36 |
+
class FourierEmbedder(nn.Module):
|
37 |
+
"""The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
|
38 |
+
each feature dimension of `x[..., i]` into:
|
39 |
+
[
|
40 |
+
sin(x[..., i]),
|
41 |
+
sin(f_1*x[..., i]),
|
42 |
+
sin(f_2*x[..., i]),
|
43 |
+
...
|
44 |
+
sin(f_N * x[..., i]),
|
45 |
+
cos(x[..., i]),
|
46 |
+
cos(f_1*x[..., i]),
|
47 |
+
cos(f_2*x[..., i]),
|
48 |
+
...
|
49 |
+
cos(f_N * x[..., i]),
|
50 |
+
x[..., i] # only present if include_input is True.
|
51 |
+
], here f_i is the frequency.
|
52 |
+
|
53 |
+
Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
|
54 |
+
If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
|
55 |
+
Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
|
56 |
+
|
57 |
+
Args:
|
58 |
+
num_freqs (int): the number of frequencies, default is 6;
|
59 |
+
logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
|
60 |
+
otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
|
61 |
+
input_dim (int): the input dimension, default is 3;
|
62 |
+
include_input (bool): include the input tensor or not, default is True.
|
63 |
+
|
64 |
+
Attributes:
|
65 |
+
frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
|
66 |
+
otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
|
67 |
+
|
68 |
+
out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
|
69 |
+
otherwise, it is input_dim * num_freqs * 2.
|
70 |
+
|
71 |
+
"""
|
72 |
+
|
73 |
+
def __init__(self,
|
74 |
+
num_freqs: int = 6,
|
75 |
+
logspace: bool = True,
|
76 |
+
input_dim: int = 3,
|
77 |
+
include_input: bool = True,
|
78 |
+
include_pi: bool = True) -> None:
|
79 |
+
|
80 |
+
"""The initialization"""
|
81 |
+
|
82 |
+
super().__init__()
|
83 |
+
|
84 |
+
if logspace:
|
85 |
+
frequencies = 2.0 ** torch.arange(
|
86 |
+
num_freqs,
|
87 |
+
dtype=torch.float32
|
88 |
+
)
|
89 |
+
else:
|
90 |
+
frequencies = torch.linspace(
|
91 |
+
1.0,
|
92 |
+
2.0 ** (num_freqs - 1),
|
93 |
+
num_freqs,
|
94 |
+
dtype=torch.float32
|
95 |
+
)
|
96 |
+
|
97 |
+
if include_pi:
|
98 |
+
frequencies *= torch.pi
|
99 |
+
|
100 |
+
self.register_buffer("frequencies", frequencies, persistent=False)
|
101 |
+
self.include_input = include_input
|
102 |
+
self.num_freqs = num_freqs
|
103 |
+
|
104 |
+
self.out_dim = self.get_dims(input_dim)
|
105 |
+
|
106 |
+
def get_dims(self, input_dim):
|
107 |
+
temp = 1 if self.include_input or self.num_freqs == 0 else 0
|
108 |
+
out_dim = input_dim * (self.num_freqs * 2 + temp)
|
109 |
+
|
110 |
+
return out_dim
|
111 |
+
|
112 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
113 |
+
""" Forward process.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
x: tensor of shape [..., dim]
|
117 |
+
|
118 |
+
Returns:
|
119 |
+
embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
|
120 |
+
where temp is 1 if include_input is True and 0 otherwise.
|
121 |
+
"""
|
122 |
+
|
123 |
+
if self.num_freqs > 0:
|
124 |
+
embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
|
125 |
+
if self.include_input:
|
126 |
+
return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
|
127 |
+
else:
|
128 |
+
return torch.cat((embed.sin(), embed.cos()), dim=-1)
|
129 |
+
else:
|
130 |
+
return x
|
131 |
+
|
132 |
+
|
133 |
+
class DropPath(nn.Module):
|
134 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
135 |
+
"""
|
136 |
+
|
137 |
+
def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
|
138 |
+
super(DropPath, self).__init__()
|
139 |
+
self.drop_prob = drop_prob
|
140 |
+
self.scale_by_keep = scale_by_keep
|
141 |
+
|
142 |
+
def forward(self, x):
|
143 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
144 |
+
|
145 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
146 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
147 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
148 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
149 |
+
'survival rate' as the argument.
|
150 |
+
|
151 |
+
"""
|
152 |
+
if self.drop_prob == 0. or not self.training:
|
153 |
+
return x
|
154 |
+
keep_prob = 1 - self.drop_prob
|
155 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
156 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
157 |
+
if keep_prob > 0.0 and self.scale_by_keep:
|
158 |
+
random_tensor.div_(keep_prob)
|
159 |
+
return x * random_tensor
|
160 |
+
|
161 |
+
def extra_repr(self):
|
162 |
+
return f'drop_prob={round(self.drop_prob, 3):0.3f}'
|
163 |
+
|
164 |
+
|
165 |
+
class MLP(nn.Module):
|
166 |
+
def __init__(
|
167 |
+
self, *,
|
168 |
+
width: int,
|
169 |
+
output_width: int = None,
|
170 |
+
drop_path_rate: float = 0.0
|
171 |
+
):
|
172 |
+
super().__init__()
|
173 |
+
self.width = width
|
174 |
+
self.c_fc = nn.Linear(width, width * 4)
|
175 |
+
self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
|
176 |
+
self.gelu = nn.GELU()
|
177 |
+
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
178 |
+
|
179 |
+
def forward(self, x):
|
180 |
+
return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
|
181 |
+
|
182 |
+
|
183 |
+
class QKVMultiheadCrossAttention(nn.Module):
|
184 |
+
def __init__(
|
185 |
+
self,
|
186 |
+
*,
|
187 |
+
heads: int,
|
188 |
+
n_data: Optional[int] = None,
|
189 |
+
width=None,
|
190 |
+
qk_norm=False,
|
191 |
+
norm_layer=nn.LayerNorm
|
192 |
+
):
|
193 |
+
super().__init__()
|
194 |
+
self.heads = heads
|
195 |
+
self.n_data = n_data
|
196 |
+
self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
197 |
+
self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
198 |
+
|
199 |
+
def forward(self, q, kv):
|
200 |
+
_, n_ctx, _ = q.shape
|
201 |
+
bs, n_data, width = kv.shape
|
202 |
+
attn_ch = width // self.heads // 2
|
203 |
+
q = q.view(bs, n_ctx, self.heads, -1)
|
204 |
+
kv = kv.view(bs, n_data, self.heads, -1)
|
205 |
+
k, v = torch.split(kv, attn_ch, dim=-1)
|
206 |
+
|
207 |
+
q = self.q_norm(q)
|
208 |
+
k = self.k_norm(k)
|
209 |
+
|
210 |
+
q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
|
211 |
+
out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
|
212 |
+
|
213 |
+
return out
|
214 |
+
|
215 |
+
|
216 |
+
class MultiheadCrossAttention(nn.Module):
|
217 |
+
def __init__(
|
218 |
+
self,
|
219 |
+
*,
|
220 |
+
width: int,
|
221 |
+
heads: int,
|
222 |
+
qkv_bias: bool = True,
|
223 |
+
n_data: Optional[int] = None,
|
224 |
+
data_width: Optional[int] = None,
|
225 |
+
norm_layer=nn.LayerNorm,
|
226 |
+
qk_norm: bool = False
|
227 |
+
):
|
228 |
+
super().__init__()
|
229 |
+
self.n_data = n_data
|
230 |
+
self.width = width
|
231 |
+
self.heads = heads
|
232 |
+
self.data_width = width if data_width is None else data_width
|
233 |
+
self.c_q = nn.Linear(width, width, bias=qkv_bias)
|
234 |
+
self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
|
235 |
+
self.c_proj = nn.Linear(width, width)
|
236 |
+
self.attention = QKVMultiheadCrossAttention(
|
237 |
+
heads=heads,
|
238 |
+
n_data=n_data,
|
239 |
+
width=width,
|
240 |
+
norm_layer=norm_layer,
|
241 |
+
qk_norm=qk_norm
|
242 |
+
)
|
243 |
+
|
244 |
+
def forward(self, x, data):
|
245 |
+
x = self.c_q(x)
|
246 |
+
data = self.c_kv(data)
|
247 |
+
x = self.attention(x, data)
|
248 |
+
x = self.c_proj(x)
|
249 |
+
return x
|
250 |
+
|
251 |
+
|
252 |
+
class ResidualCrossAttentionBlock(nn.Module):
|
253 |
+
def __init__(
|
254 |
+
self,
|
255 |
+
*,
|
256 |
+
n_data: Optional[int] = None,
|
257 |
+
width: int,
|
258 |
+
heads: int,
|
259 |
+
data_width: Optional[int] = None,
|
260 |
+
qkv_bias: bool = True,
|
261 |
+
norm_layer=nn.LayerNorm,
|
262 |
+
qk_norm: bool = False
|
263 |
+
):
|
264 |
+
super().__init__()
|
265 |
+
|
266 |
+
if data_width is None:
|
267 |
+
data_width = width
|
268 |
+
|
269 |
+
self.attn = MultiheadCrossAttention(
|
270 |
+
n_data=n_data,
|
271 |
+
width=width,
|
272 |
+
heads=heads,
|
273 |
+
data_width=data_width,
|
274 |
+
qkv_bias=qkv_bias,
|
275 |
+
norm_layer=norm_layer,
|
276 |
+
qk_norm=qk_norm
|
277 |
+
)
|
278 |
+
self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
279 |
+
self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
|
280 |
+
self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
281 |
+
self.mlp = MLP(width=width)
|
282 |
+
|
283 |
+
def forward(self, x: torch.Tensor, data: torch.Tensor):
|
284 |
+
x = x + self.attn(self.ln_1(x), self.ln_2(data))
|
285 |
+
x = x + self.mlp(self.ln_3(x))
|
286 |
+
return x

class QKVMultiheadAttention(nn.Module):
    def __init__(
        self,
        *,
        heads: int,
        n_ctx: int,
        width=None,
        qk_norm=False,
        norm_layer=nn.LayerNorm
    ):
        super().__init__()
        self.heads = heads
        self.n_ctx = n_ctx
        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()

    def forward(self, qkv):
        bs, n_ctx, width = qkv.shape
        attn_ch = width // self.heads // 3
        qkv = qkv.view(bs, n_ctx, self.heads, -1)
        q, k, v = torch.split(qkv, attn_ch, dim=-1)

        q = self.q_norm(q)
        k = self.k_norm(k)

        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
        return out


class MultiheadAttention(nn.Module):
    def __init__(
        self,
        *,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool,
        norm_layer=nn.LayerNorm,
        qk_norm: bool = False,
        drop_path_rate: float = 0.0
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.heads = heads
        self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
        self.c_proj = nn.Linear(width, width)
        self.attention = QKVMultiheadAttention(
            heads=heads,
            n_ctx=n_ctx,
            width=width,
            norm_layer=norm_layer,
            qk_norm=qk_norm
        )
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

    def forward(self, x):
        x = self.c_qkv(x)
        x = self.attention(x)
        x = self.drop_path(self.c_proj(x))
        return x


class ResidualAttentionBlock(nn.Module):
    def __init__(
        self,
        *,
        n_ctx: int,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        norm_layer=nn.LayerNorm,
        qk_norm: bool = False,
        drop_path_rate: float = 0.0,
    ):
        super().__init__()
        self.attn = MultiheadAttention(
            n_ctx=n_ctx,
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            norm_layer=norm_layer,
            qk_norm=qk_norm,
            drop_path_rate=drop_path_rate
        )
        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)

    def forward(self, x: torch.Tensor):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(
        self,
        *,
        n_ctx: int,
        width: int,
        layers: int,
        heads: int,
        qkv_bias: bool = True,
        norm_layer=nn.LayerNorm,
        qk_norm: bool = False,
        drop_path_rate: float = 0.0
    ):
        super().__init__()
        self.n_ctx = n_ctx
        self.width = width
        self.layers = layers
        self.resblocks = nn.ModuleList(
            [
                ResidualAttentionBlock(
                    n_ctx=n_ctx,
                    width=width,
                    heads=heads,
                    qkv_bias=qkv_bias,
                    norm_layer=norm_layer,
                    qk_norm=qk_norm,
                    drop_path_rate=drop_path_rate
                )
                for _ in range(layers)
            ]
        )

    def forward(self, x: torch.Tensor):
        for block in self.resblocks:
            x = block(x)
        return x


class CrossAttentionDecoder(nn.Module):

    def __init__(
        self,
        *,
        num_latents: int,
        out_channels: int,
        fourier_embedder: FourierEmbedder,
        width: int,
        heads: int,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        label_type: str = "binary"
    ):
        super().__init__()

        self.fourier_embedder = fourier_embedder

        self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)

        self.cross_attn_decoder = ResidualCrossAttentionBlock(
            n_data=num_latents,
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm
        )

        self.ln_post = nn.LayerNorm(width)
        self.output_proj = nn.Linear(width, out_channels)
        self.label_type = label_type

    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
        x = self.cross_attn_decoder(queries, latents)
        x = self.ln_post(x)
        occ = self.output_proj(x)
        return occ


def generate_dense_grid_points(bbox_min: np.ndarray,
                               bbox_max: np.ndarray,
                               octree_depth: int,
                               indexing: str = "ij",
                               octree_resolution: int = None,
                               ):
    length = bbox_max - bbox_min
    num_cells = np.exp2(octree_depth)
    if octree_resolution is not None:
        num_cells = octree_resolution

    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
    xyz = np.stack((xs, ys, zs), axis=-1)
    xyz = xyz.reshape(-1, 3)
    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]

    return xyz, grid_size, length


def center_vertices(vertices):
    """Translate the vertices so that bounding box is centered at zero."""
    vert_min = vertices.min(dim=0)[0]
    vert_max = vertices.max(dim=0)[0]
    vert_center = 0.5 * (vert_min + vert_max)
    return vertices - vert_center


class Latent2MeshOutput:

    def __init__(self, mesh_v=None, mesh_f=None):
        self.mesh_v = mesh_v
        self.mesh_f = mesh_f


class ShapeVAE(nn.Module):
    def __init__(
        self,
        *,
        num_latents: int,
        embed_dim: int,
        width: int,
        heads: int,
        num_decoder_layers: int,
        num_freqs: int = 8,
        include_pi: bool = True,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        label_type: str = "binary",
        drop_path_rate: float = 0.0,
        scale_factor: float = 1.0,
    ):
        super().__init__()
        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        self.post_kl = nn.Linear(embed_dim, width)

        self.transformer = Transformer(
            n_ctx=num_latents,
            width=width,
            layers=num_decoder_layers,
            heads=heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            drop_path_rate=drop_path_rate
        )

        self.geo_decoder = CrossAttentionDecoder(
            fourier_embedder=self.fourier_embedder,
            out_channels=1,
            num_latents=num_latents,
            width=width,
            heads=heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            label_type=label_type,
        )

        self.scale_factor = scale_factor
        self.latent_shape = (num_latents, embed_dim)

    def forward(self, latents):
        latents = self.post_kl(latents)
        latents = self.transformer(latents)
        return latents

    @torch.no_grad()
    def latents2mesh(
        self,
        latents: torch.FloatTensor,
        bounds: Union[Tuple[float], List[float], float] = 1.1,
        octree_depth: int = 7,
        num_chunks: int = 10000,
        mc_level: float = -1 / 512,
        octree_resolution: int = None,
        mc_algo: str = 'dmc',
    ):
        device = latents.device

        # 1. generate query points
        if isinstance(bounds, float):
            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
        bbox_min = np.array(bounds[0:3])
        bbox_max = np.array(bounds[3:6])
        bbox_size = bbox_max - bbox_min
        xyz_samples, grid_size, length = generate_dense_grid_points(
            bbox_min=bbox_min,
            bbox_max=bbox_max,
            octree_depth=octree_depth,
            octree_resolution=octree_resolution,
            indexing="ij"
        )
        xyz_samples = torch.FloatTensor(xyz_samples)

        # 2. latents to 3d volume
        batch_logits = []
        batch_size = latents.shape[0]
        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
                          desc=f"MC Level {mc_level} Implicit Function:"):
            queries = xyz_samples[start: start + num_chunks, :].to(device)
            queries = queries.half()
            batch_queries = repeat(queries, "p c -> b p c", b=batch_size)

            logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
            if mc_level == -1:
                mc_level = 0
                logits = torch.sigmoid(logits) * 2 - 1
                print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.')
            batch_logits.append(logits)
        grid_logits = torch.cat(batch_logits, dim=1)
        grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()

        # 3. extract surface
        outputs = []
        for i in range(batch_size):
            try:
                if mc_algo == 'mc':
                    vertices, faces, normals, _ = measure.marching_cubes(
                        grid_logits[i].cpu().numpy(),
                        mc_level,
                        method="lewiner"
                    )
                    vertices = vertices / grid_size * bbox_size + bbox_min
                elif mc_algo == 'dmc':
                    if not hasattr(self, 'dmc'):
                        try:
                            from diso import DiffDMC
                        except:
                            raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
                        self.dmc = DiffDMC(dtype=torch.float32).to(device)
                    octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
                    sdf = -grid_logits[i] / octree_resolution
                    verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
                    verts = center_vertices(verts)
                    vertices = verts.detach().cpu().numpy()
                    faces = faces.detach().cpu().numpy()[:, ::-1]
                else:
                    raise ValueError(f"mc_algo {mc_algo} not supported.")

                outputs.append(
                    Latent2MeshOutput(
                        mesh_v=vertices.astype(np.float32),
                        mesh_f=np.ascontiguousarray(faces)
                    )
                )

            except ValueError:
                outputs.append(None)
            except RuntimeError:
                outputs.append(None)

        return outputs
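A minimal sketch of how the decoder above is typically driven, mirroring the `_export` step of the pipeline in `pipelines.py` below; `vae` and `latents` are placeholders for a configured `ShapeVAE` and sampled latents, not part of the diff:

import trimesh

latents = latents / vae.scale_factor          # undo the training-time latent scaling
latents = vae(latents)                        # post_kl projection + transformer decoding
mesh_outputs = vae.latents2mesh(
    latents,
    bounds=1.01,
    octree_resolution=384,
    mc_level=0.0,
    num_chunks=8000,
    mc_algo='mc',
)
meshes = [trimesh.Trimesh(m.mesh_v, m.mesh_f) for m in mesh_outputs if m is not None]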
build/lib/hy3dgen/shapegen/pipelines.py
ADDED
@@ -0,0 +1,589 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import copy
import importlib
import inspect
import logging
import os
from typing import List, Optional, Union

import numpy as np
import torch
import trimesh
import yaml
from PIL import Image
from diffusers.utils.torch_utils import randn_tensor
from tqdm import tqdm

logger = logging.getLogger(__name__)


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


def export_to_trimesh(mesh_output):
    if isinstance(mesh_output, list):
        outputs = []
        for mesh in mesh_output:
            if mesh is None:
                outputs.append(None)
            else:
                mesh.mesh_f = mesh.mesh_f[:, ::-1]
                mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f)
                outputs.append(mesh_output)
        return outputs
    else:
        mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1]
        mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f)
        return mesh_output


def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def instantiate_from_config(config, **kwargs):
    if "target" not in config:
        raise KeyError("Expected key `target` to instantiate.")
    cls = get_obj_from_str(config["target"])
    params = config.get("params", dict())
    kwargs.update(params)
    instance = cls(**kwargs)
    return instance


class Hunyuan3DDiTPipeline:
    @classmethod
    def from_single_file(
        cls,
        ckpt_path,
        config_path,
        device='cpu',
        dtype=torch.float16,
        **kwargs,
    ):
        # load config
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # load ckpt
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"Model file {ckpt_path} not found")
        logger.info(f"Loading model from {ckpt_path}")

        if ckpt_path.endswith('.safetensors'):
            # parse safetensors
            import safetensors.torch
            safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
            ckpt = {}
            for key, value in safetensors_ckpt.items():
                model_name = key.split('.')[0]
                new_key = key[len(model_name) + 1:]
                if model_name not in ckpt:
                    ckpt[model_name] = {}
                ckpt[model_name][new_key] = value
        else:
            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)

        # load model
        from accelerate import init_empty_weights
        with init_empty_weights():
            model = instantiate_from_config(config['model'])
            vae = instantiate_from_config(config['vae'])
            conditioner = instantiate_from_config(config['conditioner'])
            image_processor = instantiate_from_config(config['image_processor'])
            scheduler = instantiate_from_config(config['scheduler'])

        model.load_state_dict(ckpt['model'], assign=True)
        vae.load_state_dict(ckpt['vae'], assign=True)
        if 'conditioner' in ckpt:
            conditioner.load_state_dict(ckpt['conditioner'], assign=True)

        model_kwargs = dict(
            vae=vae,
            model=model,
            scheduler=scheduler,
            conditioner=conditioner,
            image_processor=image_processor,
            device=device,
            dtype=dtype,
        )
        model_kwargs.update(kwargs)

        return cls(
            **model_kwargs
        )

    @classmethod
    def from_pretrained(
        cls,
        model_path,
        device='cuda',
        dtype=torch.float16,
        use_safetensors=None,
        variant=None,
        subfolder='hunyuan3d-dit-v2-0',
        **kwargs,
    ):
        original_model_path = model_path
        if not os.path.exists(model_path):
            # try local path
            base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
            model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
            if not os.path.exists(model_path):
                try:
                    import huggingface_hub
                    # download from huggingface
                    path = huggingface_hub.snapshot_download(repo_id=original_model_path)
                    model_path = os.path.join(path, subfolder)
                except ImportError:
                    logger.warning(
                        "You need to install HuggingFace Hub to load models from the hub."
                    )
                    raise RuntimeError(f"Model path {model_path} not found")
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model path {original_model_path} not found")

        extension = 'ckpt' if not use_safetensors else 'safetensors'
        variant = '' if variant is None else f'.{variant}'
        ckpt_name = f'model{variant}.{extension}'
        config_path = os.path.join(model_path, 'config.yaml')
        ckpt_path = os.path.join(model_path, ckpt_name)

        return cls.from_single_file(
            ckpt_path,
            config_path,
            device=device,
            dtype=dtype,
            use_safetensors=use_safetensors,
            variant=variant,
            **kwargs
        )

    def __init__(
        self,
        vae,
        model,
        scheduler,
        conditioner,
        image_processor,
        device='cuda',
        dtype=torch.float16,
        **kwargs
    ):
        self.vae = vae
        self.model = model
        self.scheduler = scheduler
        self.conditioner = conditioner
        self.image_processor = image_processor

        self.to(device, dtype)

    def to(self, device=None, dtype=None):
        if device is not None:
            self.device = torch.device(device)
            self.vae.to(device)
            self.model.to(device)
            self.conditioner.to(device)
        if dtype is not None:
            self.dtype = dtype
            self.vae.to(dtype=dtype)
            self.model.to(dtype=dtype)
            self.conditioner.to(dtype=dtype)

    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
        bsz = image.shape[0]
        cond = self.conditioner(image=image, mask=mask)

        if do_classifier_free_guidance:
            un_cond = self.conditioner.unconditional_embedding(bsz)

            if dual_guidance:
                un_cond_drop_main = copy.deepcopy(un_cond)
                un_cond_drop_main['additional'] = cond['additional']

                def cat_recursive(a, b, c):
                    if isinstance(a, torch.Tensor):
                        return torch.cat([a, b, c], dim=0).to(self.dtype)
                    out = {}
                    for k in a.keys():
                        out[k] = cat_recursive(a[k], b[k], c[k])
                    return out

                cond = cat_recursive(cond, un_cond_drop_main, un_cond)
            else:
                un_cond = self.conditioner.unconditional_embedding(bsz)

                def cat_recursive(a, b):
                    if isinstance(a, torch.Tensor):
                        return torch.cat([a, b], dim=0).to(self.dtype)
                    out = {}
                    for k in a.keys():
                        out[k] = cat_recursive(a[k], b[k])
                    return out

                cond = cat_recursive(cond, un_cond)
        return cond

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def prepare_latents(self, batch_size, dtype, device, generator, latents=None):
        shape = (batch_size, *self.vae.latent_shape)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
        return latents

    def prepare_image(self, image):
        if isinstance(image, str) and not os.path.exists(image):
            raise FileNotFoundError(f"Couldn't find image at path {image}")

        if not isinstance(image, list):
            image = [image]
        image_pts = []
        mask_pts = []
        for img in image:
            image_pt, mask_pt = self.image_processor(img, return_mask=True)
            image_pts.append(image_pt)
            mask_pts.append(mask_pt)

        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
        if mask_pts[0] is not None:
            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
        else:
            mask_pts = None
        return image_pts, mask_pts

    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            timesteps (`torch.Tensor`):
                generate embedding vectors at these timesteps
            embedding_dim (`int`, *optional*, defaults to 512):
                dimension of the embeddings to generate
            dtype:
                data type of the generated embeddings

        Returns:
            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    @torch.no_grad()
    def __call__(
        self,
        image: Union[str, List[str], Image.Image] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        eta: float = 0.0,
        guidance_scale: float = 7.5,
        dual_guidance_scale: float = 10.5,
        dual_guidance: bool = True,
        generator=None,
        box_v=1.01,
        octree_resolution=384,
        mc_level=-1 / 512,
        num_chunks=8000,
        mc_algo='mc',
        output_type: Optional[str] = "trimesh",
        enable_pbar=True,
        **kwargs,
    ) -> List[List[trimesh.Trimesh]]:
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        device = self.device
        dtype = self.dtype
        do_classifier_free_guidance = guidance_scale >= 0 and \
            getattr(self.model, 'guidance_cond_proj_dim', None) is None
        dual_guidance = dual_guidance_scale >= 0 and dual_guidance

        image, mask = self.prepare_image(image)
        cond = self.encode_cond(image=image,
                                mask=mask,
                                do_classifier_free_guidance=do_classifier_free_guidance,
                                dual_guidance=dual_guidance)
        batch_size = image.shape[0]

        t_dtype = torch.long
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas)

        latents = self.prepare_latents(batch_size, dtype, device, generator)
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        guidance_cond = None
        if getattr(self.model, 'guidance_cond_proj_dim', None) is not None:
            print('Using lcm guidance scale')
            guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size)
            guidance_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

        for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)):
            # expand the latents if we are doing classifier free guidance
            if do_classifier_free_guidance:
                latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2))
            else:
                latent_model_input = latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device)
            timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
            noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond)

            # no drop, drop clip, all drop
            if do_classifier_free_guidance:
                if dual_guidance:
                    noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3)
                    noise_pred = (
                        noise_pred_uncond
                        + guidance_scale * (noise_pred_clip - noise_pred_dino)
                        + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond)
                    )
                else:
                    noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
            latents = outputs.prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, outputs)

        return self._export(
            latents,
            output_type,
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )

    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo):
        if not output_type == "latent":
            latents = 1. / self.vae.scale_factor * latents
            latents = self.vae(latents)
            outputs = self.vae.latents2mesh(
                latents,
                bounds=box_v,
                mc_level=mc_level,
                num_chunks=num_chunks,
                octree_resolution=octree_resolution,
                mc_algo=mc_algo,
            )
        else:
            outputs = latents

        if output_type == 'trimesh':
            outputs = export_to_trimesh(outputs)

        return outputs


class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):

    @torch.no_grad()
    def __call__(
        self,
        image: Union[str, List[str], Image.Image] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        eta: float = 0.0,
        guidance_scale: float = 7.5,
        generator=None,
        box_v=1.01,
        octree_resolution=384,
        mc_level=0.0,
        mc_algo='mc',
        num_chunks=8000,
        output_type: Optional[str] = "trimesh",
        enable_pbar=True,
        **kwargs,
    ) -> List[List[trimesh.Trimesh]]:
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        device = self.device
        dtype = self.dtype
        do_classifier_free_guidance = guidance_scale >= 0 and not (
            hasattr(self.model, 'guidance_embed') and
            self.model.guidance_embed is True
        )

        image, mask = self.prepare_image(image)
        cond = self.encode_cond(
            image=image,
            mask=mask,
            do_classifier_free_guidance=do_classifier_free_guidance,
            dual_guidance=False,
        )
        batch_size = image.shape[0]

        # 5. Prepare timesteps
        # NOTE: this is slightly different from common usage, we start from 0.
        sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            sigmas=sigmas,
        )
        latents = self.prepare_latents(batch_size, dtype, device, generator)

        guidance = None
        if hasattr(self.model, 'guidance_embed') and \
                self.model.guidance_embed is True:
            guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)

        for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
            # expand the latents if we are doing classifier free guidance
            if do_classifier_free_guidance:
                latent_model_input = torch.cat([latents] * 2)
            else:
                latent_model_input = latents

            # NOTE: we assume model get timesteps ranged from 0 to 1
            timestep = t.expand(latent_model_input.shape[0]).to(
                latents.dtype) / self.scheduler.config.num_train_timesteps
            noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance)

            if do_classifier_free_guidance:
                noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            outputs = self.scheduler.step(noise_pred, t, latents)
            latents = outputs.prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, outputs)

        return self._export(
            latents,
            output_type,
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )
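A minimal usage sketch for the flow-matching pipeline above; the model id and image path are assumptions, and any local directory laid out as `<model_path>/hunyuan3d-dit-v2-0/` with `config.yaml` and `model.ckpt` (or `model.safetensors`) should also work with `from_pretrained`:

import torch
from hy3dgen.shapegen.pipelines import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2',   # assumed repo id; resolved via HY3DGEN_MODELS or the HuggingFace hub
    device='cuda',
    dtype=torch.float16,
)
meshes = pipeline(
    image='demo.png',        # placeholder path; a PIL.Image or list of paths also works
    num_inference_steps=50,
    guidance_scale=7.5,
    octree_resolution=384,
    output_type='trimesh',
)
if meshes[0] is not None:
    meshes[0].export('demo.glb')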
build/lib/hy3dgen/shapegen/postprocessors.py
ADDED
@@ -0,0 +1,175 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import tempfile
from typing import Union

import pymeshlab
import trimesh

from .models.vae import Latent2MeshOutput


def load_mesh(path):
    if path.endswith(".glb"):
        mesh = trimesh.load(path)
    else:
        mesh = pymeshlab.MeshSet()
        mesh.load_new_mesh(path)
    return mesh


def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
    mesh.apply_filter(
        "meshing_decimation_quadric_edge_collapse",
        targetfacenum=max_facenum,
        qualitythr=1.0,
        preserveboundary=True,
        boundaryweight=3,
        preservenormal=True,
        preservetopology=True,
        autoclean=True
    )
    return mesh


def remove_floater(mesh: pymeshlab.MeshSet):
    mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
                      nbfaceratio=0.005)
    mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
    mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
    return mesh


def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
    temp_file.close()
    temp_file_name = temp_file.name

    mesh.save_current_mesh(temp_file_name)
    mesh = trimesh.load(temp_file_name)
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)

    # check the type of the loaded object
    if isinstance(mesh, trimesh.Scene):
        combined_mesh = trimesh.Trimesh()
        # if it is a Scene, iterate over all geometries and merge them
        for geom in mesh.geometry.values():
            combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
        mesh = combined_mesh
    return mesh


def trimesh2pymeshlab(mesh: trimesh.Trimesh):
    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
    temp_file.close()
    temp_file_name = temp_file.name

    if isinstance(mesh, trimesh.scene.Scene):
        for idx, obj in enumerate(mesh.geometry.values()):
            if idx == 0:
                temp_mesh = obj
            else:
                temp_mesh = temp_mesh + obj
        mesh = temp_mesh
    mesh.export(temp_file_name)
    mesh = pymeshlab.MeshSet()
    mesh.load_new_mesh(temp_file_name)
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)

    return mesh


def export_mesh(input, output):
    if isinstance(input, pymeshlab.MeshSet):
        mesh = output
    elif isinstance(input, Latent2MeshOutput):
        # keep a handle on the processed MeshSet before rebinding `output`
        ms = output
        output = Latent2MeshOutput()
        output.mesh_v = ms.current_mesh().vertex_matrix()
        output.mesh_f = ms.current_mesh().face_matrix()
        mesh = output
    else:
        mesh = pymeshlab2trimesh(output)
    return mesh


def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
    if isinstance(mesh, str):
        mesh = load_mesh(mesh)
    elif isinstance(mesh, Latent2MeshOutput):
        # build the pymeshlab mesh from the raw vertices/faces before rebinding `mesh`
        mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
        mesh = pymeshlab.MeshSet()
        mesh.add_mesh(mesh_pymeshlab, "converted_mesh")

    if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
        mesh = trimesh2pymeshlab(mesh)

    return mesh


class FaceReducer:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
        max_facenum: int = 40000
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
        ms = import_mesh(mesh)
        ms = reduce_face(ms, max_facenum=max_facenum)
        mesh = export_mesh(mesh, ms)
        return mesh


class FloaterRemover:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
        ms = import_mesh(mesh)
        ms = remove_floater(ms)
        mesh = export_mesh(mesh, ms)
        return mesh


class DegenerateFaceRemover:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
        ms = import_mesh(mesh)

        temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
        temp_file.close()
        temp_file_name = temp_file.name

        ms.save_current_mesh(temp_file_name)
        ms = pymeshlab.MeshSet()
        ms.load_new_mesh(temp_file_name)
        if os.path.exists(temp_file_name):
            os.remove(temp_file_name)

        mesh = export_mesh(mesh, ms)
        return mesh
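The cleanup passes above are plain callables and compose directly; a minimal sketch on a mesh produced by the shape pipeline (`mesh` is a placeholder for a `trimesh.Trimesh`, `Latent2MeshOutput`, `pymeshlab.MeshSet`, or file path):

from hy3dgen.shapegen.postprocessors import FloaterRemover, DegenerateFaceRemover, FaceReducer

mesh = FloaterRemover()(mesh)                   # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)            # round-trip through pymeshlab to clean degenerate faces
mesh = FaceReducer()(mesh, max_facenum=40000)   # quadric edge-collapse decimation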
build/lib/hy3dgen/shapegen/preprocessors.py
ADDED
@@ -0,0 +1,127 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company.  All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import cv2
import numpy as np
import torch
from PIL import Image
from einops import repeat, rearrange


def array_to_tensor(np_array):
    image_pt = torch.tensor(np_array).float()
    image_pt = image_pt / 255 * 2 - 1
    image_pt = rearrange(image_pt, "h w c -> c h w")
    image_pts = repeat(image_pt, "c h w -> b c h w", b=1)
    return image_pts


class ImageProcessorV2:
    def __init__(self, size=512, border_ratio=None):
        self.size = size
        self.border_ratio = border_ratio

    @staticmethod
    def recenter(image, border_ratio: float = 0.2):
        """ recenter an image to leave some empty space at the image border.

        Args:
            image (ndarray): input image, float/uint8 [H, W, 3/4]
            mask (ndarray): alpha mask, bool [H, W]
            border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2.

        Returns:
            ndarray: output image, float/uint8 [H, W, 3/4]
        """

        if image.shape[-1] == 4:
            mask = image[..., 3]
        else:
            mask = np.ones_like(image[..., 0:1]) * 255
            image = np.concatenate([image, mask], axis=-1)
            mask = mask[..., 0]

        H, W, C = image.shape

        size = max(H, W)
        result = np.zeros((size, size, C), dtype=np.uint8)

        coords = np.nonzero(mask)
        x_min, x_max = coords[0].min(), coords[0].max()
        y_min, y_max = coords[1].min(), coords[1].max()
        h = x_max - x_min
        w = y_max - y_min
        if h == 0 or w == 0:
            raise ValueError('input image is empty')
        desired_size = int(size * (1 - border_ratio))
        scale = desired_size / max(h, w)
        h2 = int(h * scale)
        w2 = int(w * scale)
        x2_min = (size - h2) // 2
        x2_max = x2_min + h2

        y2_min = (size - w2) // 2
        y2_max = y2_min + w2

        result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2),
                                                          interpolation=cv2.INTER_AREA)

        bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
        # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
        mask = result[..., 3:].astype(np.float32) / 255
        result = result[..., :3] * mask + bg * (1 - mask)

        mask = mask * 255
        result = result.clip(0, 255).astype(np.uint8)
        mask = mask.clip(0, 255).astype(np.uint8)
        return result, mask

    def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
        if self.border_ratio is not None:
            border_ratio = self.border_ratio
            print(f"Using border_ratio from init: {border_ratio}")
        if isinstance(image, str):
            image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
            image, mask = self.recenter(image, border_ratio=border_ratio)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        elif isinstance(image, Image.Image):
            image = np.asarray(image)
            image, mask = self.recenter(image, border_ratio=border_ratio)

        image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
        mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST)
        mask = mask[..., np.newaxis]

        if to_tensor:
            image = array_to_tensor(image)
            mask = array_to_tensor(mask)
        if return_mask:
            return image, mask
        return image


IMAGE_PROCESSORS = {
    "v2": ImageProcessorV2,
}

DEFAULT_IMAGEPROCESSOR = 'v2'
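A minimal sketch of `ImageProcessorV2`: recenter an RGB/RGBA image with a border, resize to 512x512, and return batched tensors in [-1, 1]; the file name is a placeholder:

from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)
image_t, mask_t = processor('demo_rgba.png', border_ratio=0.15, return_mask=True)
print(image_t.shape, mask_t.shape)  # torch.Size([1, 3, 512, 512]) torch.Size([1, 1, 512, 512])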
build/lib/hy3dgen/shapegen/schedulers.py
ADDED
@@ -0,0 +1,307 @@
# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.schedulers.scheduling_utils import SchedulerMixin
from diffusers.utils import BaseOutput, logging

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
    """

    prev_sample: torch.FloatTensor


class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
    """
    NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed

    Euler scheduler.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        timestep_spacing (`str`, defaults to `"linspace"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        shift (`float`, defaults to 1.0):
            The shift value for the timestep schedule.
    """

    _compatibles = []
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        shift: float = 1.0,
        use_dynamic_shifting=False,
    ):
        timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy()
        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)

        sigmas = timesteps / num_train_timesteps
        if not use_dynamic_shifting:
            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)

        self.timesteps = sigmas * num_train_timesteps

        self._step_index = None
        self._begin_index = None

        self.sigmas = sigmas.to("cpu")  # to avoid too much CPU/GPU communication
        self.sigma_min = self.sigmas[-1].item()
        self.sigma_max = self.sigmas[0].item()

    @property
    def step_index(self):
        """
        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

    @property
    def begin_index(self):
        """
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
        """
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

        Args:
            begin_index (`int`):
                The begin index for the scheduler.
        """
        self._begin_index = begin_index

    def scale_noise(
        self,
        sample: torch.FloatTensor,
        timestep: Union[float, torch.FloatTensor],
        noise: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        """
        Forward process in flow-matching

        Args:
            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain.

        Returns:
            `torch.FloatTensor`:
                A scaled input sample.
        """
        # Make sure sigmas and timesteps have the same device and dtype as original_samples
        sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)

        if sample.device.type == "mps" and torch.is_floating_point(timestep):
            # mps does not support float64
            schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
            timestep = timestep.to(sample.device, dtype=torch.float32)
        else:
            schedule_timesteps = self.timesteps.to(sample.device)
            timestep = timestep.to(sample.device)

        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
        elif self.step_index is not None:
            # add_noise is called after first denoising step (for inpainting)
            step_indices = [self.step_index] * timestep.shape[0]
        else:
            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timestep.shape[0]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < len(sample.shape):
            sigma = sigma.unsqueeze(-1)

        sample = sigma * noise + (1.0 - sigma) * sample

        return sample

    def _sigma_to_t(self, sigma):
        return sigma * self.config.num_train_timesteps

    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

    def set_timesteps(
        self,
        num_inference_steps: int = None,
        device: Union[str, torch.device] = None,
        sigmas: Optional[List[float]] = None,
        mu: Optional[float] = None,
    ):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        """

        if self.config.use_dynamic_shifting and mu is None:
            raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")

        if sigmas is None:
            self.num_inference_steps = num_inference_steps
            timesteps = np.linspace(
                self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
            )

            sigmas = timesteps / self.config.num_train_timesteps

        if self.config.use_dynamic_shifting:
            sigmas = self.time_shift(mu, 1.0, sigmas)
        else:
            sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)

        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
        timesteps = sigmas * self.config.num_train_timesteps

        self.timesteps = timesteps.to(device=device)
        self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)])

        self._step_index = None
        self._begin_index = None

    def index_for_timestep(self, timestep, schedule_timesteps=None):
        if schedule_timesteps is None:
            schedule_timesteps = self.timesteps

        indices = (schedule_timesteps == timestep).nonzero()

        # The sigma index that is taken for the **very** first `step`
        # is always the second index (or the last index if there is only 1)
        # This way we can ensure we don't accidentally skip a sigma in
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        pos = 1 if len(indices) > 1 else 0

        return indices[pos].item()

    def _init_step_index(self, timestep):
        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
            self._step_index = self.index_for_timestep(timestep)
        else:
            self._step_index = self._begin_index

    def step(
        self,
        model_output: torch.FloatTensor,
        timestep: Union[float, torch.FloatTensor],
        sample: torch.FloatTensor,
        s_churn: float = 0.0,
        s_tmin: float = 0.0,
        s_tmax: float = float("inf"),
        s_noise: float = 1.0,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
    ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
|
243 |
+
"""
|
244 |
+
Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
|
245 |
+
process from the learned model outputs (most often the predicted noise).
|
246 |
+
|
247 |
+
Args:
|
248 |
+
model_output (`torch.FloatTensor`):
|
249 |
+
The direct output from learned diffusion model.
|
250 |
+
timestep (`float`):
|
251 |
+
The current discrete timestep in the diffusion chain.
|
252 |
+
sample (`torch.FloatTensor`):
|
253 |
+
A current instance of a sample created by the diffusion process.
|
254 |
+
s_churn (`float`):
|
255 |
+
s_tmin (`float`):
|
256 |
+
s_tmax (`float`):
|
257 |
+
s_noise (`float`, defaults to 1.0):
|
258 |
+
Scaling factor for noise added to the sample.
|
259 |
+
generator (`torch.Generator`, *optional*):
|
260 |
+
A random number generator.
|
261 |
+
return_dict (`bool`):
|
262 |
+
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
|
263 |
+
tuple.
|
264 |
+
|
265 |
+
Returns:
|
266 |
+
[`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
|
267 |
+
If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
|
268 |
+
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
269 |
+
"""
|
270 |
+
|
271 |
+
if (
|
272 |
+
isinstance(timestep, int)
|
273 |
+
or isinstance(timestep, torch.IntTensor)
|
274 |
+
or isinstance(timestep, torch.LongTensor)
|
275 |
+
):
|
276 |
+
raise ValueError(
|
277 |
+
(
|
278 |
+
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
279 |
+
" `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
|
280 |
+
" one of the `scheduler.timesteps` as a timestep."
|
281 |
+
),
|
282 |
+
)
|
283 |
+
|
284 |
+
if self.step_index is None:
|
285 |
+
self._init_step_index(timestep)
|
286 |
+
|
287 |
+
# Upcast to avoid precision issues when computing prev_sample
|
288 |
+
sample = sample.to(torch.float32)
|
289 |
+
|
290 |
+
sigma = self.sigmas[self.step_index]
|
291 |
+
sigma_next = self.sigmas[self.step_index + 1]
|
292 |
+
|
293 |
+
prev_sample = sample + (sigma_next - sigma) * model_output
|
294 |
+
|
295 |
+
# Cast sample back to model compatible dtype
|
296 |
+
prev_sample = prev_sample.to(model_output.dtype)
|
297 |
+
|
298 |
+
# upon completion increase step index by one
|
299 |
+
self._step_index += 1
|
300 |
+
|
301 |
+
if not return_dict:
|
302 |
+
return (prev_sample,)
|
303 |
+
|
304 |
+
return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
|
305 |
+
|
306 |
+
def __len__(self):
|
307 |
+
return self.config.num_train_timesteps
|
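A minimal usage sketch of the flow-matching Euler scheduler above. The class and module names (`FlowMatchEulerDiscreteScheduler` in `hy3dgen.shapegen.schedulers`) and the constructor arguments are assumptions inferred from this diff, not a documented entry point; the latent and model output are placeholders.

# Hypothetical usage sketch; names and constructor arguments are assumptions from this diff.
import torch
from hy3dgen.shapegen.schedulers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
scheduler.set_timesteps(num_inference_steps=30, device="cpu")

latents = torch.randn(1, 64, 16)                 # stand-in latent tensor
for t in scheduler.timesteps:
    model_output = torch.zeros_like(latents)     # placeholder for the denoiser prediction
    latents = scheduler.step(model_output, t, latents).prev_sample

Each step is a plain Euler update, latents += (sigma_next - sigma) * model_output, so the whole loop moves the sample along the learned flow from noise to data.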
build/lib/hy3dgen/texgen/__init__.py
ADDED
@@ -0,0 +1,26 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig
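For orientation, texture generation is typically driven through this package-level import. The sketch below is an assumption based on the exported names and the project's published examples, not part of this diff; the `from_pretrained` identifier and the call signature may differ.

# Hypothetical usage; repository id, call signature, and asset path are assumptions.
from hy3dgen.texgen import Hunyuan3DPaintPipeline

paint_pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
textured_mesh = paint_pipeline(mesh, image='assets/shoes.png')   # mesh: a trimesh.Trimesh from the shape stage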
build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py
ADDED
@@ -0,0 +1,116 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import math

import numpy as np
import torch


def transform_pos(mtx, pos, keepdim=False):
    t_mtx = torch.from_numpy(mtx).to(pos.device) if isinstance(mtx, np.ndarray) else mtx
    if pos.shape[-1] == 3:
        posw = torch.cat(
            [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1)
    else:
        posw = pos

    if keepdim:
        return torch.matmul(posw, t_mtx.t())[...]
    else:
        return torch.matmul(posw, t_mtx.t())[None, ...]


def get_mv_matrix(elev, azim, camera_distance, center=None):
    elev = -elev
    azim += 90

    elev_rad = math.radians(elev)
    azim_rad = math.radians(azim)

    camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad),
                                camera_distance * math.cos(elev_rad) * math.sin(azim_rad),
                                camera_distance * math.sin(elev_rad)])

    if center is None:
        center = np.array([0, 0, 0])
    else:
        center = np.array(center)

    lookat = center - camera_position
    lookat = lookat / np.linalg.norm(lookat)

    up = np.array([0, 0, 1.0])
    right = np.cross(lookat, up)
    right = right / np.linalg.norm(right)
    up = np.cross(right, lookat)
    up = up / np.linalg.norm(up)

    c2w = np.concatenate(
        [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1)

    w2c = np.zeros((4, 4))
    w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0))
    w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:])
    w2c[3, 3] = 1.0

    return w2c.astype(np.float32)


def get_orthographic_projection_matrix(
        left=-1, right=1, bottom=-1, top=1, near=0, far=2):
    """
    Compute an orthographic projection matrix.

    Args:
        left (float): Left boundary of the projection volume.
        right (float): Right boundary of the projection volume.
        bottom (float): Bottom boundary of the projection volume.
        top (float): Top boundary of the projection volume.
        near (float): Near clipping plane distance.
        far (float): Far clipping plane distance.

    Returns:
        numpy.ndarray: The orthographic projection matrix.
    """
    ortho_matrix = np.eye(4, dtype=np.float32)
    ortho_matrix[0, 0] = 2 / (right - left)
    ortho_matrix[1, 1] = 2 / (top - bottom)
    ortho_matrix[2, 2] = -2 / (far - near)
    ortho_matrix[0, 3] = -(right + left) / (right - left)
    ortho_matrix[1, 3] = -(top + bottom) / (top - bottom)
    ortho_matrix[2, 3] = -(far + near) / (far - near)
    return ortho_matrix


def get_perspective_projection_matrix(fovy, aspect_wh, near, far):
    fovy_rad = math.radians(fovy)
    return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0],
                     [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0],
                     [0, 0, -(far + near) / (far - near), -2.0 * far * near / (far - near)],
                     [0, 0, -1, 0]]).astype(np.float32)
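An illustrative sketch (not part of the diff) of how these helpers compose: build a world-to-camera matrix and an orthographic projection, multiply them into an MVP, and push dummy vertices through `transform_pos`. The numeric values are arbitrary.

# Illustrative sketch; all numbers are arbitrary placeholders.
import numpy as np
import torch

mv = get_mv_matrix(elev=20.0, azim=30.0, camera_distance=1.45)      # 4x4 world-to-camera
proj = get_orthographic_projection_matrix(left=-0.6, right=0.6,
                                          bottom=-0.6, top=0.6, near=0.1, far=100)
mvp = np.matmul(proj, mv).astype(np.float32)

points = torch.rand(8, 3)            # dummy vertices in world space
clip = transform_pos(mvp, points)    # homogeneous clip-space coordinates, shape [1, 8, 4]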
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py
ADDED
@@ -0,0 +1,70 @@
import numpy as np

def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx):
    texture_height, texture_width, texture_channel = texture.shape
    vtx_num = vtx_pos.shape[0]

    vtx_mask = np.zeros(vtx_num, dtype=np.float32)
    vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)]
    uncolored_vtxs = []
    G = [[] for _ in range(vtx_num)]

    for i in range(uv_idx.shape[0]):
        for k in range(3):
            vtx_uv_idx = uv_idx[i, k]
            vtx_idx = pos_idx[i, k]
            uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
            uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
            if mask[uv_u, uv_v] > 0:
                vtx_mask[vtx_idx] = 1.0
                vtx_color[vtx_idx] = texture[uv_u, uv_v]
            else:
                uncolored_vtxs.append(vtx_idx)
            G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3])

    smooth_count = 2
    last_uncolored_vtx_count = 0
    while smooth_count > 0:
        uncolored_vtx_count = 0
        for vtx_idx in uncolored_vtxs:
            sum_color = np.zeros(texture_channel, dtype=np.float32)
            total_weight = 0.0
            vtx_0 = vtx_pos[vtx_idx]
            for connected_idx in G[vtx_idx]:
                if vtx_mask[connected_idx] > 0:
                    vtx1 = vtx_pos[connected_idx]
                    dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2))
                    dist_weight = 1.0 / max(dist, 1e-4)
                    dist_weight *= dist_weight
                    sum_color += vtx_color[connected_idx] * dist_weight
                    total_weight += dist_weight
            if total_weight > 0:
                vtx_color[vtx_idx] = sum_color / total_weight
                vtx_mask[vtx_idx] = 1.0
            else:
                uncolored_vtx_count += 1

        if last_uncolored_vtx_count == uncolored_vtx_count:
            smooth_count -= 1
        else:
            smooth_count += 1
        last_uncolored_vtx_count = uncolored_vtx_count

    new_texture = texture.copy()
    new_mask = mask.copy()
    for face_idx in range(uv_idx.shape[0]):
        for k in range(3):
            vtx_uv_idx = uv_idx[face_idx, k]
            vtx_idx = pos_idx[face_idx, k]
            if vtx_mask[vtx_idx] == 1.0:
                uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
                uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
                new_texture[uv_u, uv_v] = vtx_color[vtx_idx]
                new_mask[uv_u, uv_v] = 255
    return new_texture, new_mask

def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"):
    if method == "smooth":
        return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
    else:
        raise ValueError("Invalid method. Use 'smooth' or 'forward'.")
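A small self-contained sketch of calling this vertex inpainting routine on dummy data; in the real pipeline the texture, mask, and mesh arrays come from the differentiable renderer rather than random values.

# Illustrative sketch with synthetic inputs; not taken from the pipeline code.
import numpy as np

texture = np.zeros((64, 64, 3), dtype=np.float32)
mask = np.zeros((64, 64), dtype=np.uint8)          # 0 = uncolored texel, >0 = already painted
vtx_pos = np.random.rand(4, 3).astype(np.float32)  # vertex positions
vtx_uv = np.random.rand(4, 2).astype(np.float32)   # per-vertex UVs in [0, 1]
pos_idx = np.array([[0, 1, 2], [0, 2, 3]])         # triangle vertex indices
uv_idx = pos_idx.copy()                            # triangle UV indices

new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)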
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py
ADDED
@@ -0,0 +1,833 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import cv2
import numpy as np
import torch
import torch.nn.functional as F
import trimesh
from PIL import Image

from .camera_utils import (
    transform_pos,
    get_mv_matrix,
    get_orthographic_projection_matrix,
    get_perspective_projection_matrix,
)
from .mesh_processor import meshVerticeInpaint
from .mesh_utils import load_mesh, save_mesh


def stride_from_shape(shape):
    stride = [1]
    for x in reversed(shape[1:]):
        stride.append(stride[-1] * x)
    return list(reversed(stride))


def scatter_add_nd_with_count(input, count, indices, values, weights=None):
    # input: [..., C], D dimension + C channel
    # count: [..., 1], D dimension
    # indices: [N, D], long
    # values: [N, C]

    D = indices.shape[-1]
    C = input.shape[-1]
    size = input.shape[:-1]
    stride = stride_from_shape(size)

    assert len(size) == D

    input = input.view(-1, C)  # [HW, C]
    count = count.view(-1, 1)

    flatten_indices = (indices * torch.tensor(stride,
                                              dtype=torch.long, device=indices.device)).sum(-1)  # [N]

    if weights is None:
        weights = torch.ones_like(values[..., :1])

    input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values)
    count.scatter_add_(0, flatten_indices.unsqueeze(1), weights)

    return input.view(*size, C), count.view(*size, 1)


def linear_grid_put_2d(H, W, coords, values, return_count=False):
    # coords: [N, 2], float in [0, 1]
    # values: [N, C]

    C = values.shape[-1]

    indices = coords * torch.tensor(
        [H - 1, W - 1], dtype=torch.float32, device=coords.device
    )
    indices_00 = indices.floor().long()  # [N, 2]
    indices_00[:, 0].clamp_(0, H - 2)
    indices_00[:, 1].clamp_(0, W - 2)
    indices_01 = indices_00 + torch.tensor(
        [0, 1], dtype=torch.long, device=indices.device
    )
    indices_10 = indices_00 + torch.tensor(
        [1, 0], dtype=torch.long, device=indices.device
    )
    indices_11 = indices_00 + torch.tensor(
        [1, 1], dtype=torch.long, device=indices.device
    )

    h = indices[..., 0] - indices_00[..., 0].float()
    w = indices[..., 1] - indices_00[..., 1].float()
    w_00 = (1 - h) * (1 - w)
    w_01 = (1 - h) * w
    w_10 = h * (1 - w)
    w_11 = h * w

    result = torch.zeros(H, W, C, device=values.device,
                         dtype=values.dtype)  # [H, W, C]
    count = torch.zeros(H, W, 1, device=values.device,
                        dtype=values.dtype)  # [H, W, 1]
    weights = torch.ones_like(values[..., :1])  # [N, 1]

    result, count = scatter_add_nd_with_count(
        result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1))
    result, count = scatter_add_nd_with_count(
        result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1))
    result, count = scatter_add_nd_with_count(
        result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1))
    result, count = scatter_add_nd_with_count(
        result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1))

    if return_count:
        return result, count

    mask = (count.squeeze(-1) > 0)
    result[mask] = result[mask] / count[mask].repeat(1, C)

    return result


class MeshRender():
    def __init__(
        self,
        camera_distance=1.45, camera_type='orth',
        default_resolution=1024, texture_size=1024,
        use_antialias=True, max_mip_level=None, filter_mode='linear',
        bake_mode='linear', raster_mode='cr', device='cuda'):

        self.device = device

        self.set_default_render_resolution(default_resolution)
        self.set_default_texture_resolution(texture_size)

        self.camera_distance = camera_distance
        self.use_antialias = use_antialias
        self.max_mip_level = max_mip_level
        self.filter_mode = filter_mode

        self.bake_angle_thres = 75
        self.bake_unreliable_kernel_size = int(
            (2 / 512) * max(self.default_resolution[0], self.default_resolution[1]))
        self.bake_mode = bake_mode

        self.raster_mode = raster_mode
        if self.raster_mode == 'cr':
            import custom_rasterizer as cr
            self.raster = cr
        else:
            raise ValueError(f'No raster named {self.raster_mode}')

        if camera_type == 'orth':
            self.ortho_scale = 1.2
            self.camera_proj_mat = get_orthographic_projection_matrix(
                left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5,
                bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5,
                near=0.1, far=100
            )
        elif camera_type == 'perspective':
            self.camera_proj_mat = get_perspective_projection_matrix(
                49.13, self.default_resolution[1] / self.default_resolution[0],
                0.01, 100.0
            )
        else:
            raise ValueError(f'No camera type {camera_type}')

    def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):

        if self.raster_mode == 'cr':
            rast_out_db = None
            if pos.dim() == 2:
                pos = pos.unsqueeze(0)
            findices, barycentric = self.raster.rasterize(pos, tri, resolution)
            rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1)
            rast_out = rast_out.unsqueeze(0)
        else:
            raise ValueError(f'No raster named {self.raster_mode}')

        return rast_out, rast_out_db

    def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None):

        if self.raster_mode == 'cr':
            textd = None
            barycentric = rast_out[0, ..., :-1]
            findices = rast_out[0, ..., -1]
            if uv.dim() == 2:
                uv = uv.unsqueeze(0)
            textc = self.raster.interpolate(uv, findices, barycentric, uv_idx)
        else:
            raise ValueError(f'No raster named {self.raster_mode}')

        return textc, textd

    def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto',
                       boundary_mode='wrap', max_mip_level=None):

        if self.raster_mode == 'cr':
            raise NotImplementedError('Texture is not implemented in cr')
        else:
            raise ValueError(f'No raster named {self.raster_mode}')

        return color

    def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):

        if self.raster_mode == 'cr':
            # Antialias has not been supported yet
            color = color
        else:
            raise ValueError(f'No raster named {self.raster_mode}')

        return color

    def load_mesh(
        self,
        mesh,
        scale_factor=1.15,
        auto_center=True,
    ):
        vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh)
        self.mesh_copy = mesh
        self.set_mesh(vtx_pos, pos_idx,
                      vtx_uv=vtx_uv, uv_idx=uv_idx,
                      scale_factor=scale_factor, auto_center=auto_center
                      )
        if texture_data is not None:
            self.set_texture(texture_data)

    def save_mesh(self):
        texture_data = self.get_texture()
        texture_data = Image.fromarray((texture_data * 255).astype(np.uint8))
        return save_mesh(self.mesh_copy, texture_data)

    def set_mesh(
        self,
        vtx_pos, pos_idx,
        vtx_uv=None, uv_idx=None,
        scale_factor=1.15, auto_center=True
    ):

        self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float()
        self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int)
        if (vtx_uv is not None) and (uv_idx is not None):
            self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float()
            self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int)
        else:
            self.vtx_uv = None
            self.uv_idx = None

        self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]]
        self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]]
        if (vtx_uv is not None) and (uv_idx is not None):
            self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1]

        if auto_center:
            max_bb = (self.vtx_pos - 0).max(0)[0]
            min_bb = (self.vtx_pos - 0).min(0)[0]
            center = (max_bb + min_bb) / 2
            scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0
            self.vtx_pos = (self.vtx_pos - center) * \
                (scale_factor / float(scale))
            self.scale_factor = scale_factor

    def set_texture(self, tex):
        if isinstance(tex, np.ndarray):
            tex = Image.fromarray((tex * 255).astype(np.uint8))
        elif isinstance(tex, torch.Tensor):
            tex = tex.cpu().numpy()
            tex = Image.fromarray((tex * 255).astype(np.uint8))

        tex = tex.resize(self.texture_size).convert('RGB')
        tex = np.array(tex) / 255.0
        self.tex = torch.from_numpy(tex).to(self.device)
        self.tex = self.tex.float()

    def set_default_render_resolution(self, default_resolution):
        if isinstance(default_resolution, int):
            default_resolution = (default_resolution, default_resolution)
        self.default_resolution = default_resolution

    def set_default_texture_resolution(self, texture_size):
        if isinstance(texture_size, int):
            texture_size = (texture_size, texture_size)
        self.texture_size = texture_size

    def get_mesh(self):
        vtx_pos = self.vtx_pos.cpu().numpy()
        pos_idx = self.pos_idx.cpu().numpy()
        vtx_uv = self.vtx_uv.cpu().numpy()
        uv_idx = self.uv_idx.cpu().numpy()

        # Inverse of the coordinate transform applied in set_mesh
        vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]]
        vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]]

        vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1]
        return vtx_pos, pos_idx, vtx_uv, uv_idx

    def get_texture(self):
        return self.tex.cpu().numpy()

    def to(self, device):
        self.device = device

        for attr_name in dir(self):
            attr_value = getattr(self, attr_name)
            if isinstance(attr_value, torch.Tensor):
                setattr(self, attr_name, attr_value.to(self.device))

    def color_rgb_to_srgb(self, image):
        if isinstance(image, Image.Image):
            image_rgb = torch.tensor(
                np.array(image) / 255.0).float().to(self.device)
        elif isinstance(image, np.ndarray):
            image_rgb = torch.tensor(image).float()
        else:
            image_rgb = image.to(self.device)

        image_srgb = torch.where(
            image_rgb <= 0.0031308,
            12.92 * image_rgb,
            1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055
        )

        if isinstance(image, Image.Image):
            image_srgb = Image.fromarray(
                (image_srgb.cpu().numpy() * 255).astype(np.uint8))
        elif isinstance(image, np.ndarray):
            image_srgb = image_srgb.cpu().numpy()
        else:
            image_srgb = image_srgb.to(image.device)

        return image_srgb

    def _render(
        self,
        mvp,
        pos,
        pos_idx,
        uv,
        uv_idx,
        tex,
        resolution,
        max_mip_level,
        keep_alpha,
        filter_mode
    ):
        pos_clip = transform_pos(mvp, pos)
        if isinstance(resolution, (int, float)):
            resolution = [resolution, resolution]
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, pos_idx, resolution=resolution)

        tex = tex.contiguous()
        if filter_mode == 'linear-mipmap-linear':
            texc, texd = self.raster_interpolate(
                uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
            color = self.raster_texture(
                tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
        else:
            texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx)
            color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode)

        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
        color = color * visible_mask  # Mask out background.
        if self.use_antialias:
            color = self.raster_antialias(color, rast_out, pos_clip, pos_idx)

        if keep_alpha:
            color = torch.cat([color, visible_mask], dim=-1)
        return color[0, ...]

    def render(
        self,
        elev,
        azim,
        camera_distance=None,
        center=None,
        resolution=None,
        tex=None,
        keep_alpha=True,
        bgcolor=None,
        filter_mode=None,
        return_type='th'
    ):

        proj = self.camera_proj_mat
        r_mv = get_mv_matrix(
            elev=elev,
            azim=azim,
            camera_distance=self.camera_distance if camera_distance is None else camera_distance,
            center=center)
        r_mvp = np.matmul(proj, r_mv).astype(np.float32)
        if tex is not None:
            if isinstance(tex, Image.Image):
                tex = torch.tensor(np.array(tex) / 255.0)
            elif isinstance(tex, np.ndarray):
                tex = torch.tensor(tex)
            if tex.dim() == 2:
                tex = tex.unsqueeze(-1)
            tex = tex.float().to(self.device)
        image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx,
                             self.tex if tex is None else tex,
                             self.default_resolution if resolution is None else resolution,
                             self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode)
        mask = (image[..., [-1]] == 1).float()
        if bgcolor is None:
            bgcolor = [0 for _ in range(image.shape[-1] - 1)]
        image = image * mask + (1 - mask) * \
            torch.tensor(bgcolor + [0]).to(self.device)
        if keep_alpha == False:
            image = image[..., :-1]
        if return_type == 'np':
            image = image.cpu().numpy()
        elif return_type == 'pl':
            image = image.squeeze(-1).cpu().numpy() * 255
            image = Image.fromarray(image.astype(np.uint8))
        return image

    def render_normal(
        self,
        elev,
        azim,
        camera_distance=None,
        center=None,
        resolution=None,
        bg_color=[1, 1, 1],
        use_abs_coor=False,
        normalize_rgb=True,
        return_type='th'
    ):

        pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
        if resolution is None:
            resolution = self.default_resolution
        if isinstance(resolution, (int, float)):
            resolution = [resolution, resolution]
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, self.pos_idx, resolution=resolution)

        if use_abs_coor:
            mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :]
        else:
            pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
            mesh_triangles = pos_camera[self.pos_idx[:, :3], :]
        face_normals = F.normalize(
            torch.cross(mesh_triangles[:, 1, :] - mesh_triangles[:, 0, :],
                        mesh_triangles[:, 2, :] - mesh_triangles[:, 0, :],
                        dim=-1),
            dim=-1)

        vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
                                                              faces=self.pos_idx.cpu(),
                                                              face_normals=face_normals.cpu(), )
        vertex_normals = torch.from_numpy(
            vertex_normals).float().to(self.device).contiguous()

        # Interpolate normal values across the rasterized pixels
        normal, _ = self.raster_interpolate(
            vertex_normals[None, ...], rast_out, self.pos_idx)

        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
        normal = normal * visible_mask + \
            torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - visible_mask)  # Mask out background.

        if normalize_rgb:
            normal = (normal + 1) * 0.5
        if self.use_antialias:
            normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx)

        image = normal[0, ...]
        if return_type == 'np':
            image = image.cpu().numpy()
        elif return_type == 'pl':
            image = image.cpu().numpy() * 255
            image = Image.fromarray(image.astype(np.uint8))

        return image

    def convert_normal_map(self, image):
        # blue is front, red is left, green is top
        if isinstance(image, Image.Image):
            image = np.array(image)
        mask = (image == [255, 255, 255]).all(axis=-1)

        image = (image / 255.0) * 2.0 - 1.0

        image[..., [1]] = -image[..., [1]]
        image[..., [1, 2]] = image[..., [2, 1]]
        image[..., [0]] = -image[..., [0]]

        image = (image + 1.0) * 0.5

        image = (image * 255).astype(np.uint8)
        image[mask] = [127, 127, 255]

        return Image.fromarray(image)

    def get_pos_from_mvp(self, elev, azim, camera_distance, center):
        proj = self.camera_proj_mat
        r_mv = get_mv_matrix(
            elev=elev,
            azim=azim,
            camera_distance=self.camera_distance if camera_distance is None else camera_distance,
            center=center)

        pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
        pos_clip = transform_pos(proj, pos_camera)

        return pos_camera, pos_clip

    def render_depth(
        self,
        elev,
        azim,
        camera_distance=None,
        center=None,
        resolution=None,
        return_type='th'
    ):
        pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)

        if resolution is None:
            resolution = self.default_resolution
        if isinstance(resolution, (int, float)):
            resolution = [resolution, resolution]
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, self.pos_idx, resolution=resolution)

        pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
        tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()

        # Interpolate depth values across the rasterized pixels
        depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)

        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
        depth_max, depth_min = depth[visible_mask > 0].max(), depth[visible_mask > 0].min()
        depth = (depth - depth_min) / (depth_max - depth_min)

        depth = depth * visible_mask  # Mask out background.
        if self.use_antialias:
            depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx)

        image = depth[0, ...]
        if return_type == 'np':
            image = image.cpu().numpy()
        elif return_type == 'pl':
            image = image.squeeze(-1).cpu().numpy() * 255
            image = Image.fromarray(image.astype(np.uint8))
        return image

    def render_position(self, elev, azim, camera_distance=None, center=None,
                        resolution=None, bg_color=[1, 1, 1], return_type='th'):
        pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
        if resolution is None:
            resolution = self.default_resolution
        if isinstance(resolution, (int, float)):
            resolution = [resolution, resolution]
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, self.pos_idx, resolution=resolution)

        tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor
        tex_position = tex_position.contiguous()

        # Interpolate position values across the rasterized pixels
        position, _ = self.raster_interpolate(
            tex_position[None, ...], rast_out, self.pos_idx)

        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)

        position = position * visible_mask + \
            torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - visible_mask)  # Mask out background.
        if self.use_antialias:
            position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx)

        image = position[0, ...]

        if return_type == 'np':
            image = image.cpu().numpy()
        elif return_type == 'pl':
            image = image.squeeze(-1).cpu().numpy() * 255
            image = Image.fromarray(image.astype(np.uint8))
        return image

    def render_uvpos(self, return_type='th'):
        image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5)
        if return_type == 'np':
            image = image.cpu().numpy()
        elif return_type == 'pl':
            image = image.cpu().numpy() * 255
            image = Image.fromarray(image.astype(np.uint8))
        return image

    def uv_feature_map(self, vert_feat, bg=None):
        vtx_uv = self.vtx_uv * 2 - 1.0
        vtx_uv = torch.cat(
            [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0)
        vtx_uv[..., -1] = 1
        uv_idx = self.uv_idx
        rast_out, rast_out_db = self.raster_rasterize(
            vtx_uv, uv_idx, resolution=self.texture_size)
        feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx)
        feat_map = feat_map[0, ...]
        if bg is not None:
            visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]
            feat_map[visible_mask == 0] = bg
        return feat_map

    def render_sketch_from_geometry(self, normal_image, depth_image):
        normal_image_np = normal_image.cpu().numpy()
        depth_image_np = depth_image.cpu().numpy()

        normal_image_np = (normal_image_np * 255).astype(np.uint8)
        depth_image_np = (depth_image_np * 255).astype(np.uint8)
        normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY)

        normal_edges = cv2.Canny(normal_image_np, 80, 150)
        depth_edges = cv2.Canny(depth_image_np, 30, 80)

        combined_edges = np.maximum(normal_edges, depth_edges)

        sketch_image = torch.from_numpy(combined_edges).to(
            normal_image.device).float() / 255.0
        sketch_image = sketch_image.unsqueeze(-1)

        return sketch_image

    def render_sketch_from_depth(self, depth_image):
        depth_image_np = depth_image.cpu().numpy()
        depth_image_np = (depth_image_np * 255).astype(np.uint8)
        depth_edges = cv2.Canny(depth_image_np, 30, 80)
        combined_edges = depth_edges
        sketch_image = torch.from_numpy(combined_edges).to(
            depth_image.device).float() / 255.0
        sketch_image = sketch_image.unsqueeze(-1)
        return sketch_image

    def back_project(self, image, elev, azim,
                     camera_distance=None, center=None, method=None):
        if isinstance(image, Image.Image):
            image = torch.tensor(np.array(image) / 255.0)
        elif isinstance(image, np.ndarray):
            image = torch.tensor(image)
        if image.dim() == 2:
            image = image.unsqueeze(-1)
        image = image.float().to(self.device)
        resolution = image.shape[:2]
        channel = image.shape[-1]
        texture = torch.zeros(self.texture_size + (channel,)).to(self.device)
        cos_map = torch.zeros(self.texture_size + (1,)).to(self.device)

        proj = self.camera_proj_mat
        r_mv = get_mv_matrix(
            elev=elev,
            azim=azim,
            camera_distance=self.camera_distance if camera_distance is None else camera_distance,
            center=center)
        pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
        pos_clip = transform_pos(proj, pos_camera)
        pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
        v0 = pos_camera[self.pos_idx[:, 0], :]
        v1 = pos_camera[self.pos_idx[:, 1], :]
        v2 = pos_camera[self.pos_idx[:, 2], :]
        face_normals = F.normalize(
            torch.cross(
                v1 - v0,
                v2 - v0,
                dim=-1),
            dim=-1)
        vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
                                                              faces=self.pos_idx.cpu(),
                                                              face_normals=face_normals.cpu(), )
        vertex_normals = torch.from_numpy(
            vertex_normals).float().to(self.device).contiguous()
        tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, self.pos_idx, resolution=resolution)
        visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]

        normal, _ = self.raster_interpolate(
            vertex_normals[None, ...], rast_out, self.pos_idx)
        normal = normal[0, ...]
        uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx)
        depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)
        depth = depth[0, ...]

        depth_max, depth_min = depth[visible_mask > 0].max(), depth[visible_mask > 0].min()
        depth_normalized = (depth - depth_min) / (depth_max - depth_min)
        depth_image = depth_normalized * visible_mask  # Mask out background.

        sketch_image = self.render_sketch_from_depth(depth_image)

        lookat = torch.tensor([[0, 0, -1]], device=self.device)
        cos_image = torch.nn.functional.cosine_similarity(
            lookat, normal.view(-1, 3))
        cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1)

        cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi)
        cos_image[cos_image < cos_thres] = 0

        # shrink
        kernel_size = self.bake_unreliable_kernel_size * 2 + 1
        kernel = torch.ones(
            (1, 1, kernel_size, kernel_size), dtype=torch.float32).to(
            sketch_image.device)

        visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float()
        visible_mask = F.conv2d(
            1.0 - visible_mask,
            kernel,
            padding=kernel_size // 2)
        visible_mask = 1.0 - (visible_mask > 0).float()  # binarize
        visible_mask = visible_mask.squeeze(0).permute(1, 2, 0)

        sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0)
        sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2)
        sketch_image = (sketch_image > 0).float()  # binarize
        sketch_image = sketch_image.squeeze(0).permute(1, 2, 0)
        visible_mask = visible_mask * (sketch_image < 0.5)

        cos_image[visible_mask == 0] = 0

        method = self.bake_mode if method is None else method

        if method == 'linear':
            proj_mask = (visible_mask != 0).view(-1)
            uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask]
            image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask]
            cos_image = cos_image.contiguous().view(-1, 1)[proj_mask]
            sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask]

            texture = linear_grid_put_2d(
                self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image)
            cos_map = linear_grid_put_2d(
                self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image)
            boundary_map = linear_grid_put_2d(
                self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image)
        else:
            raise ValueError(f'No bake mode {method}')

        return texture, cos_map, boundary_map

    def bake_texture(self, colors, elevs, azims,
                     camera_distance=None, center=None, exp=6, weights=None):
        for i in range(len(colors)):
            if isinstance(colors[i], Image.Image):
                colors[i] = torch.tensor(
                    np.array(colors[i]) / 255.0,
                    device=self.device).float()
        if weights is None:
            weights = [1.0 for _ in range(len(colors))]
        textures = []
        cos_maps = []
        for color, elev, azim, weight in zip(colors, elevs, azims, weights):
            texture, cos_map, _ = self.back_project(
                color, elev, azim, camera_distance, center)
            cos_map = weight * (cos_map ** exp)
            textures.append(texture)
            cos_maps.append(cos_map)

        texture_merge, trust_map_merge = self.fast_bake_texture(
            textures, cos_maps)
        return texture_merge, trust_map_merge

    @torch.no_grad()
    def fast_bake_texture(self, textures, cos_maps):

        channel = textures[0].shape[-1]
        texture_merge = torch.zeros(
            self.texture_size + (channel,)).to(self.device)
        trust_map_merge = torch.zeros(self.texture_size + (1,)).to(self.device)
        for texture, cos_map in zip(textures, cos_maps):
            view_sum = (cos_map > 0).sum()
            painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum()
            if painted_sum / view_sum > 0.99:
                continue
            texture_merge += texture * cos_map
            trust_map_merge += cos_map
        texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8)

        return texture_merge, trust_map_merge > 1E-8

    def uv_inpaint(self, texture, mask):

        if isinstance(texture, torch.Tensor):
            texture_np = texture.cpu().numpy()
        elif isinstance(texture, np.ndarray):
            texture_np = texture
        elif isinstance(texture, Image.Image):
            texture_np = np.array(texture) / 255.0

        vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh()

        texture_np, mask = meshVerticeInpaint(
            texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)

        texture_np = cv2.inpaint(
            (texture_np * 255).astype(np.uint8),
            255 - mask,
            3,
            cv2.INPAINT_NS)

        return texture_np
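A hypothetical end-to-end sketch of the renderer above. It assumes the `custom_rasterizer` extension is installed and a CUDA device is available; the mesh path is a placeholder, and the commented baking call just mirrors the `bake_texture` signature shown in this file.

# Hypothetical usage sketch; 'demo.glb' is a placeholder path.
import trimesh

renderer = MeshRender(default_resolution=1024, texture_size=1024, device='cuda')
renderer.load_mesh(trimesh.load('demo.glb', force='mesh'))

normal_view = renderer.render_normal(elev=0, azim=0, return_type='pl')   # PIL normal map
depth_view = renderer.render_depth(elev=0, azim=0, return_type='pl')     # PIL depth map

# Baking painted views back into UV space would follow the same pattern, e.g.:
# texture, trust = renderer.bake_texture(colors=[view_a, view_b], elevs=[0, 0], azims=[0, 180])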
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py
ADDED
@@ -0,0 +1,44 @@
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
import trimesh
|
26 |
+
|
27 |
+
|
28 |
+
def load_mesh(mesh):
|
29 |
+
vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None
|
30 |
+
pos_idx = mesh.faces if hasattr(mesh, 'faces') else None
|
31 |
+
|
32 |
+
vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None
|
33 |
+
uv_idx = mesh.faces if hasattr(mesh, 'faces') else None
|
34 |
+
|
35 |
+
texture_data = None
|
36 |
+
|
37 |
+
return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data
|
38 |
+
|
39 |
+
|
40 |
+
def save_mesh(mesh, texture_data):
|
41 |
+
material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255))
|
42 |
+
texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material)
|
43 |
+
mesh.visual = texture_visuals
|
44 |
+
return mesh
|
build/lib/hy3dgen/texgen/differentiable_renderer/setup.py
ADDED
@@ -0,0 +1,48 @@
from setuptools import setup, Extension
import pybind11
import sys
import platform

def get_platform_specific_args():
    system = platform.system().lower()
    cpp_std = 'c++14'  # Make configurable if needed

    if sys.platform == 'win32':
        compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj']
        link_args = []
        extra_includes = []
    elif system == 'linux':
        compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread']
        link_args = ['-fPIC', '-pthread']
        extra_includes = []
    elif sys.platform == 'darwin':
        compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra',
                        '-stdlib=libc++', '-mmacosx-version-min=10.14']
        link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib']
        extra_includes = []
    else:
        raise RuntimeError(f"Unsupported platform: {system}")

    return compile_args, link_args, extra_includes

extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args()
include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)]
include_dirs.extend(platform_includes)

ext_modules = [
    Extension(
        "mesh_processor",
        ["mesh_processor.cpp"],
        include_dirs=include_dirs,
        language='c++',
        extra_compile_args=extra_compile_args,
        extra_link_args=extra_link_args,
    ),
]

setup(
    name="mesh_processor",
    ext_modules=ext_modules,
    install_requires=['pybind11>=2.6.0'],
    python_requires='>=3.6',
)
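This `setup.py` builds the pybind11 `mesh_processor` extension used by the differentiable renderer (for example, the `meshVerticeInpaint` call in `mesh_render.py` above). A minimal sketch of building it in place and checking the import; the exported symbol name is inferred from its use above and may differ:

import subprocess
import sys

# Equivalent to running `python setup.py build_ext --inplace` inside this directory.
subprocess.run([sys.executable, "setup.py", "build_ext", "--inplace"], check=True)

import mesh_processor                                   # the compiled pybind11 module
print(hasattr(mesh_processor, "meshVerticeInpaint"))    # expected helper, name assumed from mesh_render.py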
build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py
ADDED
@@ -0,0 +1,554 @@
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
26 |
+
|
27 |
+
import numpy
|
28 |
+
import numpy as np
|
29 |
+
import torch
|
30 |
+
import torch.distributed
|
31 |
+
import torch.utils.checkpoint
|
32 |
+
from PIL import Image
|
33 |
+
from diffusers import (
|
34 |
+
AutoencoderKL,
|
35 |
+
DiffusionPipeline,
|
36 |
+
ImagePipelineOutput
|
37 |
+
)
|
38 |
+
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
|
39 |
+
from diffusers.image_processor import PipelineImageInput
|
40 |
+
from diffusers.image_processor import VaeImageProcessor
|
41 |
+
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
|
42 |
+
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \
|
43 |
+
rescale_noise_cfg
|
44 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
45 |
+
from diffusers.utils import deprecate
|
46 |
+
from einops import rearrange
|
47 |
+
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
48 |
+
|
49 |
+
from .unet.modules import UNet2p5DConditionModel
|
50 |
+
|
51 |
+
|
52 |
+
def to_rgb_image(maybe_rgba: Image.Image):
|
53 |
+
if maybe_rgba.mode == 'RGB':
|
54 |
+
return maybe_rgba
|
55 |
+
elif maybe_rgba.mode == 'RGBA':
|
56 |
+
rgba = maybe_rgba
|
57 |
+
img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8)
|
58 |
+
img = Image.fromarray(img, 'RGB')
|
59 |
+
img.paste(rgba, mask=rgba.getchannel('A'))
|
60 |
+
return img
|
61 |
+
else:
|
62 |
+
raise ValueError("Unsupported image type.", maybe_rgba.mode)
|
63 |
+
|
64 |
+
|
65 |
+
class HunyuanPaintPipeline(StableDiffusionPipeline):
|
66 |
+
|
67 |
+
def __init__(
|
68 |
+
self,
|
69 |
+
vae: AutoencoderKL,
|
70 |
+
text_encoder: CLIPTextModel,
|
71 |
+
tokenizer: CLIPTokenizer,
|
72 |
+
unet: UNet2p5DConditionModel,
|
73 |
+
scheduler: KarrasDiffusionSchedulers,
|
74 |
+
feature_extractor: CLIPImageProcessor,
|
75 |
+
safety_checker=None,
|
76 |
+
use_torch_compile=False,
|
77 |
+
):
|
78 |
+
DiffusionPipeline.__init__(self)
|
79 |
+
|
80 |
+
safety_checker = None
|
81 |
+
self.register_modules(
|
82 |
+
vae=torch.compile(vae) if use_torch_compile else vae,
|
83 |
+
text_encoder=text_encoder,
|
84 |
+
tokenizer=tokenizer,
|
85 |
+
unet=unet,
|
86 |
+
scheduler=scheduler,
|
87 |
+
safety_checker=safety_checker,
|
88 |
+
feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor,
|
89 |
+
)
|
90 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
91 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
92 |
+
|
93 |
+
@torch.no_grad()
|
94 |
+
def encode_images(self, images):
|
95 |
+
B = images.shape[0]
|
96 |
+
images = rearrange(images, 'b n c h w -> (b n) c h w')
|
97 |
+
|
98 |
+
dtype = next(self.vae.parameters()).dtype
|
99 |
+
images = (images - 0.5) * 2.0
|
100 |
+
posterior = self.vae.encode(images.to(dtype)).latent_dist
|
101 |
+
latents = posterior.sample() * self.vae.config.scaling_factor
|
102 |
+
|
103 |
+
latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B)
|
104 |
+
return latents
|
105 |
+
|
106 |
+
@torch.no_grad()
|
107 |
+
def __call__(
|
108 |
+
self,
|
109 |
+
image: Image.Image = None,
|
110 |
+
prompt=None,
|
111 |
+
negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast',
|
112 |
+
*args,
|
113 |
+
num_images_per_prompt: Optional[int] = 1,
|
114 |
+
guidance_scale=2.0,
|
115 |
+
output_type: Optional[str] = "pil",
|
116 |
+
width=512,
|
117 |
+
height=512,
|
118 |
+
num_inference_steps=28,
|
119 |
+
return_dict=True,
|
120 |
+
**cached_condition,
|
121 |
+
):
|
122 |
+
if image is None:
|
123 |
+
raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.")
|
124 |
+
assert not isinstance(image, torch.Tensor)
|
125 |
+
|
126 |
+
image = to_rgb_image(image)
|
127 |
+
|
128 |
+
image_vae = torch.tensor(np.array(image) / 255.0)
|
129 |
+
image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0)
|
130 |
+
image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype)
|
131 |
+
|
132 |
+
batch_size = image_vae.shape[0]
|
133 |
+
assert batch_size == 1
|
134 |
+
assert num_images_per_prompt == 1
|
135 |
+
|
136 |
+
ref_latents = self.encode_images(image_vae)
|
137 |
+
|
138 |
+
def convert_pil_list_to_tensor(images):
|
139 |
+
bg_c = [1., 1., 1.]
|
140 |
+
images_tensor = []
|
141 |
+
for batch_imgs in images:
|
142 |
+
view_imgs = []
|
143 |
+
for pil_img in batch_imgs:
|
144 |
+
img = numpy.asarray(pil_img, dtype=numpy.float32) / 255.
|
145 |
+
if img.shape[2] > 3:
|
146 |
+
alpha = img[:, :, 3:]
|
147 |
+
img = img[:, :, :3] * alpha + bg_c * (1 - alpha)
|
148 |
+
img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda")
|
149 |
+
view_imgs.append(img)
|
150 |
+
view_imgs = torch.cat(view_imgs, dim=0)
|
151 |
+
images_tensor.append(view_imgs.unsqueeze(0))
|
152 |
+
|
153 |
+
images_tensor = torch.cat(images_tensor, dim=0)
|
154 |
+
return images_tensor
|
155 |
+
|
156 |
+
if "normal_imgs" in cached_condition:
|
157 |
+
|
158 |
+
if isinstance(cached_condition["normal_imgs"], List):
|
159 |
+
cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"])
|
160 |
+
|
161 |
+
cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"])
|
162 |
+
|
163 |
+
if "position_imgs" in cached_condition:
|
164 |
+
|
165 |
+
if isinstance(cached_condition["position_imgs"], List):
|
166 |
+
cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"])
|
167 |
+
|
168 |
+
cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"])
|
169 |
+
|
170 |
+
if 'camera_info_gen' in cached_condition:
|
171 |
+
camera_info = cached_condition['camera_info_gen'] # B,N
|
172 |
+
if isinstance(camera_info, List):
|
173 |
+
camera_info = torch.tensor(camera_info)
|
174 |
+
camera_info = camera_info.to(image_vae.device).to(torch.int64)
|
175 |
+
cached_condition['camera_info_gen'] = camera_info
|
176 |
+
if 'camera_info_ref' in cached_condition:
|
177 |
+
camera_info = cached_condition['camera_info_ref'] # B,N
|
178 |
+
if isinstance(camera_info, List):
|
179 |
+
camera_info = torch.tensor(camera_info)
|
180 |
+
camera_info = camera_info.to(image_vae.device).to(torch.int64)
|
181 |
+
cached_condition['camera_info_ref'] = camera_info
|
182 |
+
|
183 |
+
cached_condition['ref_latents'] = ref_latents
|
184 |
+
|
185 |
+
if guidance_scale > 1:
|
186 |
+
negative_ref_latents = torch.zeros_like(cached_condition['ref_latents'])
|
187 |
+
cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']])
|
188 |
+
cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents'])
|
189 |
+
if "normal_imgs" in cached_condition:
|
190 |
+
cached_condition['normal_imgs'] = torch.cat(
|
191 |
+
(cached_condition['normal_imgs'], cached_condition['normal_imgs']))
|
192 |
+
|
193 |
+
if "position_imgs" in cached_condition:
|
194 |
+
cached_condition['position_imgs'] = torch.cat(
|
195 |
+
(cached_condition['position_imgs'], cached_condition['position_imgs']))
|
196 |
+
|
197 |
+
if 'position_maps' in cached_condition:
|
198 |
+
cached_condition['position_maps'] = torch.cat(
|
199 |
+
(cached_condition['position_maps'], cached_condition['position_maps']))
|
200 |
+
|
201 |
+
if 'camera_info_gen' in cached_condition:
|
202 |
+
cached_condition['camera_info_gen'] = torch.cat(
|
203 |
+
(cached_condition['camera_info_gen'], cached_condition['camera_info_gen']))
|
204 |
+
if 'camera_info_ref' in cached_condition:
|
205 |
+
cached_condition['camera_info_ref'] = torch.cat(
|
206 |
+
(cached_condition['camera_info_ref'], cached_condition['camera_info_ref']))
|
207 |
+
|
208 |
+
prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1)
|
209 |
+
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
|
210 |
+
|
211 |
+
latents: torch.Tensor = self.denoise(
|
212 |
+
None,
|
213 |
+
*args,
|
214 |
+
cross_attention_kwargs=None,
|
215 |
+
guidance_scale=guidance_scale,
|
216 |
+
num_images_per_prompt=num_images_per_prompt,
|
217 |
+
prompt_embeds=prompt_embeds,
|
218 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
219 |
+
num_inference_steps=num_inference_steps,
|
220 |
+
output_type='latent',
|
221 |
+
width=width,
|
222 |
+
height=height,
|
223 |
+
**cached_condition
|
224 |
+
).images
|
225 |
+
|
226 |
+
if not output_type == "latent":
|
227 |
+
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
|
228 |
+
else:
|
229 |
+
image = latents
|
230 |
+
|
231 |
+
image = self.image_processor.postprocess(image, output_type=output_type)
|
232 |
+
if not return_dict:
|
233 |
+
return (image,)
|
234 |
+
|
235 |
+
return ImagePipelineOutput(images=image)
|
236 |
+
|
237 |
+
def denoise(
|
238 |
+
self,
|
239 |
+
prompt: Union[str, List[str]] = None,
|
240 |
+
height: Optional[int] = None,
|
241 |
+
width: Optional[int] = None,
|
242 |
+
num_inference_steps: int = 50,
|
243 |
+
timesteps: List[int] = None,
|
244 |
+
sigmas: List[float] = None,
|
245 |
+
guidance_scale: float = 7.5,
|
246 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
247 |
+
num_images_per_prompt: Optional[int] = 1,
|
248 |
+
eta: float = 0.0,
|
249 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
250 |
+
latents: Optional[torch.Tensor] = None,
|
251 |
+
prompt_embeds: Optional[torch.Tensor] = None,
|
252 |
+
negative_prompt_embeds: Optional[torch.Tensor] = None,
|
253 |
+
ip_adapter_image: Optional[PipelineImageInput] = None,
|
254 |
+
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
255 |
+
output_type: Optional[str] = "pil",
|
256 |
+
return_dict: bool = True,
|
257 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
258 |
+
guidance_rescale: float = 0.0,
|
259 |
+
clip_skip: Optional[int] = None,
|
260 |
+
callback_on_step_end: Optional[
|
261 |
+
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
262 |
+
] = None,
|
263 |
+
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
264 |
+
**kwargs,
|
265 |
+
):
|
266 |
+
r"""
|
267 |
+
The call function to the pipeline for generation.
|
268 |
+
|
269 |
+
Args:
|
270 |
+
prompt (`str` or `List[str]`, *optional*):
|
271 |
+
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
272 |
+
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
273 |
+
The height in pixels of the generated image.
|
274 |
+
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
|
275 |
+
The width in pixels of the generated image.
|
276 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
277 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
278 |
+
expense of slower inference.
|
279 |
+
timesteps (`List[int]`, *optional*):
|
280 |
+
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
|
281 |
+
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
282 |
+
passed will be used. Must be in descending order.
|
283 |
+
sigmas (`List[float]`, *optional*):
|
284 |
+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
|
285 |
+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
|
286 |
+
will be used.
|
287 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
288 |
+
A higher guidance scale value encourages the model to generate images closely linked to the text
|
289 |
+
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
290 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
291 |
+
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
292 |
+
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
293 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
294 |
+
The number of images to generate per prompt.
|
295 |
+
eta (`float`, *optional*, defaults to 0.0):
|
296 |
+
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
297 |
+
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
298 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
299 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
300 |
+
generation deterministic.
|
301 |
+
latents (`torch.Tensor`, *optional*):
|
302 |
+
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
|
303 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
304 |
+
tensor is generated by sampling using the supplied random `generator`.
|
305 |
+
prompt_embeds (`torch.Tensor`, *optional*):
|
306 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
|
307 |
+
provided, text embeddings are generated from the `prompt` input argument.
|
308 |
+
negative_prompt_embeds (`torch.Tensor`, *optional*):
|
309 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
310 |
+
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
311 |
+
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
312 |
+
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
|
313 |
+
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
314 |
+
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
|
315 |
+
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
|
316 |
+
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
317 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
318 |
+
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
|
319 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
320 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
321 |
+
plain tuple.
|
322 |
+
cross_attention_kwargs (`dict`, *optional*):
|
323 |
+
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
|
324 |
+
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
325 |
+
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
326 |
+
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
|
327 |
+
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
|
328 |
+
using zero terminal SNR.
|
329 |
+
clip_skip (`int`, *optional*):
|
330 |
+
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
331 |
+
the output of the pre-final layer will be used for computing the prompt embeddings.
|
332 |
+
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
333 |
+
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
|
334 |
+
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
|
335 |
+
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
|
336 |
+
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
|
337 |
+
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
338 |
+
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
339 |
+
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
340 |
+
`._callback_tensor_inputs` attribute of your pipeline class.
|
341 |
+
|
342 |
+
Examples:
|
343 |
+
|
344 |
+
Returns:
|
345 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
346 |
+
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
|
347 |
+
otherwise a `tuple` is returned where the first element is a list with the generated images and the
|
348 |
+
second element is a list of `bool`s indicating whether the corresponding generated image contains
|
349 |
+
"not-safe-for-work" (nsfw) content.
|
350 |
+
"""
|
351 |
+
|
352 |
+
callback = kwargs.pop("callback", None)
|
353 |
+
callback_steps = kwargs.pop("callback_steps", None)
|
354 |
+
|
355 |
+
if callback is not None:
|
356 |
+
deprecate(
|
357 |
+
"callback",
|
358 |
+
"1.0.0",
|
359 |
+
"Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
360 |
+
)
|
361 |
+
if callback_steps is not None:
|
362 |
+
deprecate(
|
363 |
+
"callback_steps",
|
364 |
+
"1.0.0",
|
365 |
+
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
|
366 |
+
)
|
367 |
+
|
368 |
+
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
369 |
+
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
370 |
+
|
371 |
+
# 0. Default height and width to unet
|
372 |
+
height = height or self.unet.config.sample_size * self.vae_scale_factor
|
373 |
+
width = width or self.unet.config.sample_size * self.vae_scale_factor
|
374 |
+
# to deal with lora scaling and other possible forward hooks
|
375 |
+
|
376 |
+
# 1. Check inputs. Raise error if not correct
|
377 |
+
self.check_inputs(
|
378 |
+
prompt,
|
379 |
+
height,
|
380 |
+
width,
|
381 |
+
callback_steps,
|
382 |
+
negative_prompt,
|
383 |
+
prompt_embeds,
|
384 |
+
negative_prompt_embeds,
|
385 |
+
ip_adapter_image,
|
386 |
+
ip_adapter_image_embeds,
|
387 |
+
callback_on_step_end_tensor_inputs,
|
388 |
+
)
|
389 |
+
|
390 |
+
self._guidance_scale = guidance_scale
|
391 |
+
self._guidance_rescale = guidance_rescale
|
392 |
+
self._clip_skip = clip_skip
|
393 |
+
self._cross_attention_kwargs = cross_attention_kwargs
|
394 |
+
self._interrupt = False
|
395 |
+
|
396 |
+
# 2. Define call parameters
|
397 |
+
if prompt is not None and isinstance(prompt, str):
|
398 |
+
batch_size = 1
|
399 |
+
elif prompt is not None and isinstance(prompt, list):
|
400 |
+
batch_size = len(prompt)
|
401 |
+
else:
|
402 |
+
batch_size = prompt_embeds.shape[0]
|
403 |
+
|
404 |
+
device = self._execution_device
|
405 |
+
|
406 |
+
# 3. Encode input prompt
|
407 |
+
lora_scale = (
|
408 |
+
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
|
409 |
+
)
|
410 |
+
|
411 |
+
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
|
412 |
+
prompt,
|
413 |
+
device,
|
414 |
+
num_images_per_prompt,
|
415 |
+
self.do_classifier_free_guidance,
|
416 |
+
negative_prompt,
|
417 |
+
prompt_embeds=prompt_embeds,
|
418 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
419 |
+
lora_scale=lora_scale,
|
420 |
+
clip_skip=self.clip_skip,
|
421 |
+
)
|
422 |
+
|
423 |
+
# For classifier free guidance, we need to do two forward passes.
|
424 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
425 |
+
# to avoid doing two forward passes
|
426 |
+
if self.do_classifier_free_guidance:
|
427 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
428 |
+
|
429 |
+
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
430 |
+
image_embeds = self.prepare_ip_adapter_image_embeds(
|
431 |
+
ip_adapter_image,
|
432 |
+
ip_adapter_image_embeds,
|
433 |
+
device,
|
434 |
+
batch_size * num_images_per_prompt,
|
435 |
+
self.do_classifier_free_guidance,
|
436 |
+
)
|
437 |
+
|
438 |
+
# 4. Prepare timesteps
|
439 |
+
timesteps, num_inference_steps = retrieve_timesteps(
|
440 |
+
self.scheduler, num_inference_steps, device, timesteps, sigmas
|
441 |
+
)
|
442 |
+
assert num_images_per_prompt == 1
|
443 |
+
# 5. Prepare latent variables
|
444 |
+
num_channels_latents = self.unet.config.in_channels
|
445 |
+
latents = self.prepare_latents(
|
446 |
+
batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
|
447 |
+
num_channels_latents,
|
448 |
+
height,
|
449 |
+
width,
|
450 |
+
prompt_embeds.dtype,
|
451 |
+
device,
|
452 |
+
generator,
|
453 |
+
latents,
|
454 |
+
)
|
455 |
+
|
456 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
457 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
458 |
+
|
459 |
+
# 6.1 Add image embeds for IP-Adapter
|
460 |
+
added_cond_kwargs = (
|
461 |
+
{"image_embeds": image_embeds}
|
462 |
+
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
|
463 |
+
else None
|
464 |
+
)
|
465 |
+
|
466 |
+
# 6.2 Optionally get Guidance Scale Embedding
|
467 |
+
timestep_cond = None
|
468 |
+
if self.unet.config.time_cond_proj_dim is not None:
|
469 |
+
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
|
470 |
+
timestep_cond = self.get_guidance_scale_embedding(
|
471 |
+
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
|
472 |
+
).to(device=device, dtype=latents.dtype)
|
473 |
+
|
474 |
+
# 7. Denoising loop
|
475 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
476 |
+
self._num_timesteps = len(timesteps)
|
477 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
478 |
+
for i, t in enumerate(timesteps):
|
479 |
+
if self.interrupt:
|
480 |
+
continue
|
481 |
+
|
482 |
+
# expand the latents if we are doing classifier free guidance
|
483 |
+
latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch'])
|
484 |
+
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
|
485 |
+
latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w')
|
486 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
487 |
+
latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch'])
|
488 |
+
|
489 |
+
# predict the noise residual
|
490 |
+
|
491 |
+
noise_pred = self.unet(
|
492 |
+
latent_model_input,
|
493 |
+
t,
|
494 |
+
encoder_hidden_states=prompt_embeds,
|
495 |
+
timestep_cond=timestep_cond,
|
496 |
+
cross_attention_kwargs=self.cross_attention_kwargs,
|
497 |
+
added_cond_kwargs=added_cond_kwargs,
|
498 |
+
return_dict=False, **kwargs
|
499 |
+
)[0]
|
500 |
+
latents = rearrange(latents, 'b n c h w -> (b n) c h w')
|
501 |
+
# perform guidance
|
502 |
+
if self.do_classifier_free_guidance:
|
503 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
504 |
+
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
505 |
+
|
506 |
+
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
507 |
+
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
508 |
+
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
509 |
+
|
510 |
+
# compute the previous noisy sample x_t -> x_t-1
|
511 |
+
latents = \
|
512 |
+
self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs,
|
513 |
+
return_dict=False)[0]
|
514 |
+
|
515 |
+
if callback_on_step_end is not None:
|
516 |
+
callback_kwargs = {}
|
517 |
+
for k in callback_on_step_end_tensor_inputs:
|
518 |
+
callback_kwargs[k] = locals()[k]
|
519 |
+
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
520 |
+
|
521 |
+
latents = callback_outputs.pop("latents", latents)
|
522 |
+
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
523 |
+
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
|
524 |
+
|
525 |
+
# call the callback, if provided
|
526 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
527 |
+
progress_bar.update()
|
528 |
+
if callback is not None and i % callback_steps == 0:
|
529 |
+
step_idx = i // getattr(self.scheduler, "order", 1)
|
530 |
+
callback(step_idx, t, latents)
|
531 |
+
|
532 |
+
if not output_type == "latent":
|
533 |
+
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
|
534 |
+
0
|
535 |
+
]
|
536 |
+
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
|
537 |
+
else:
|
538 |
+
image = latents
|
539 |
+
has_nsfw_concept = None
|
540 |
+
|
541 |
+
if has_nsfw_concept is None:
|
542 |
+
do_denormalize = [True] * image.shape[0]
|
543 |
+
else:
|
544 |
+
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
|
545 |
+
|
546 |
+
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
|
547 |
+
|
548 |
+
# Offload all models
|
549 |
+
self.maybe_free_model_hooks()
|
550 |
+
|
551 |
+
if not return_dict:
|
552 |
+
return (image, has_nsfw_concept)
|
553 |
+
|
554 |
+
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
|
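The pipeline defined in this file is a multi-view variant of `StableDiffusionPipeline`: `__call__` encodes the reference image (plus optional normal/position maps and camera indices) into `cached_condition`, doubles those conditions for classifier-free guidance, and `denoise` runs the standard diffusers loop while forwarding the extras to the 2.5D UNet. A minimal invocation sketch follows; the checkpoint directory, view count, and camera-index values are illustrative assumptions, with keyword names taken from the code above.

import torch
from PIL import Image

# HunyuanPaintPipeline is the class defined in this file.
# Assumed: a local checkpoint directory laid out for DiffusionPipeline.from_pretrained.
pipe = HunyuanPaintPipeline.from_pretrained(
    "path/to/hunyuan3d-paint-checkpoint", torch_dtype=torch.float16,
).to("cuda")

ref = Image.open("assets/shoes.png")                                    # single reference view (RGB or RGBA)
normal_maps = [[Image.open(f"normal_{i}.png") for i in range(6)]]       # batch x num_views lists of PIL images
position_maps = [[Image.open(f"position_{i}.png") for i in range(6)]]

out = pipe(
    image=ref,
    normal_imgs=normal_maps,
    position_imgs=position_maps,
    camera_info_gen=[[0, 1, 2, 3, 4, 5]],      # per-view camera indices, shape B x N as used above
    camera_info_ref=[[0]],
    num_in_batch=6,                            # forwarded through cached_condition to denoise()
    guidance_scale=2.0,
    num_inference_steps=28,
)
views = out.images                             # generated texture views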
build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py
ADDED
@@ -0,0 +1,440 @@
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
|
26 |
+
import copy
|
27 |
+
import json
|
28 |
+
import os
|
29 |
+
from typing import Any, Dict, Optional
|
30 |
+
|
31 |
+
import torch
|
32 |
+
import torch.nn as nn
|
33 |
+
from diffusers.models import UNet2DConditionModel
|
34 |
+
from diffusers.models.attention_processor import Attention
|
35 |
+
from diffusers.models.transformers.transformer_2d import BasicTransformerBlock
|
36 |
+
from einops import rearrange
|
37 |
+
|
38 |
+
|
39 |
+
def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
|
40 |
+
# "feed_forward_chunk_size" can be used to save memory
|
41 |
+
if hidden_states.shape[chunk_dim] % chunk_size != 0:
|
42 |
+
raise ValueError(
|
43 |
+
f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
|
44 |
+
)
|
45 |
+
|
46 |
+
num_chunks = hidden_states.shape[chunk_dim] // chunk_size
|
47 |
+
ff_output = torch.cat(
|
48 |
+
[ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
|
49 |
+
dim=chunk_dim,
|
50 |
+
)
|
51 |
+
return ff_output
|
52 |
+
|
53 |
+
|
54 |
+
class Basic2p5DTransformerBlock(torch.nn.Module):
|
55 |
+
def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None:
|
56 |
+
super().__init__()
|
57 |
+
self.transformer = transformer
|
58 |
+
self.layer_name = layer_name
|
59 |
+
self.use_ma = use_ma
|
60 |
+
self.use_ra = use_ra
|
61 |
+
|
62 |
+
# multiview attn
|
63 |
+
if self.use_ma:
|
64 |
+
self.attn_multiview = Attention(
|
65 |
+
query_dim=self.dim,
|
66 |
+
heads=self.num_attention_heads,
|
67 |
+
dim_head=self.attention_head_dim,
|
68 |
+
dropout=self.dropout,
|
69 |
+
bias=self.attention_bias,
|
70 |
+
cross_attention_dim=None,
|
71 |
+
upcast_attention=self.attn1.upcast_attention,
|
72 |
+
out_bias=True,
|
73 |
+
)
|
74 |
+
|
75 |
+
# ref attn
|
76 |
+
if self.use_ra:
|
77 |
+
self.attn_refview = Attention(
|
78 |
+
query_dim=self.dim,
|
79 |
+
heads=self.num_attention_heads,
|
80 |
+
dim_head=self.attention_head_dim,
|
81 |
+
dropout=self.dropout,
|
82 |
+
bias=self.attention_bias,
|
83 |
+
cross_attention_dim=None,
|
84 |
+
upcast_attention=self.attn1.upcast_attention,
|
85 |
+
out_bias=True,
|
86 |
+
)
|
87 |
+
|
88 |
+
def __getattr__(self, name: str):
|
89 |
+
try:
|
90 |
+
return super().__getattr__(name)
|
91 |
+
except AttributeError:
|
92 |
+
return getattr(self.transformer, name)
|
93 |
+
|
94 |
+
def forward(
|
95 |
+
self,
|
96 |
+
hidden_states: torch.Tensor,
|
97 |
+
attention_mask: Optional[torch.Tensor] = None,
|
98 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
99 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
100 |
+
timestep: Optional[torch.LongTensor] = None,
|
101 |
+
cross_attention_kwargs: Dict[str, Any] = None,
|
102 |
+
class_labels: Optional[torch.LongTensor] = None,
|
103 |
+
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
104 |
+
) -> torch.Tensor:
|
105 |
+
|
106 |
+
# Notice that normalization is always applied before the real computation in the following blocks.
|
107 |
+
# 0. Self-Attention
|
108 |
+
batch_size = hidden_states.shape[0]
|
109 |
+
|
110 |
+
cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
|
111 |
+
num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1)
|
112 |
+
mode = cross_attention_kwargs.pop('mode', None)
|
113 |
+
mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0)
|
114 |
+
ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0)
|
115 |
+
condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None)
|
116 |
+
|
117 |
+
if self.norm_type == "ada_norm":
|
118 |
+
norm_hidden_states = self.norm1(hidden_states, timestep)
|
119 |
+
elif self.norm_type == "ada_norm_zero":
|
120 |
+
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
|
121 |
+
hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
|
122 |
+
)
|
123 |
+
elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
|
124 |
+
norm_hidden_states = self.norm1(hidden_states)
|
125 |
+
elif self.norm_type == "ada_norm_continuous":
|
126 |
+
norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
|
127 |
+
elif self.norm_type == "ada_norm_single":
|
128 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
|
129 |
+
self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
|
130 |
+
).chunk(6, dim=1)
|
131 |
+
norm_hidden_states = self.norm1(hidden_states)
|
132 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
|
133 |
+
else:
|
134 |
+
raise ValueError("Incorrect norm used")
|
135 |
+
|
136 |
+
if self.pos_embed is not None:
|
137 |
+
norm_hidden_states = self.pos_embed(norm_hidden_states)
|
138 |
+
|
139 |
+
# 1. Prepare GLIGEN inputs
|
140 |
+
cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
|
141 |
+
gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
|
142 |
+
|
143 |
+
attn_output = self.attn1(
|
144 |
+
norm_hidden_states,
|
145 |
+
encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
|
146 |
+
attention_mask=attention_mask,
|
147 |
+
**cross_attention_kwargs,
|
148 |
+
)
|
149 |
+
|
150 |
+
if self.norm_type == "ada_norm_zero":
|
151 |
+
attn_output = gate_msa.unsqueeze(1) * attn_output
|
152 |
+
elif self.norm_type == "ada_norm_single":
|
153 |
+
attn_output = gate_msa * attn_output
|
154 |
+
|
155 |
+
hidden_states = attn_output + hidden_states
|
156 |
+
if hidden_states.ndim == 4:
|
157 |
+
hidden_states = hidden_states.squeeze(1)
|
158 |
+
|
159 |
+
# 1.2 Reference Attention
|
160 |
+
if 'w' in mode:
|
161 |
+
condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c',
|
162 |
+
n=num_in_batch) # B, (N L), C
|
163 |
+
|
164 |
+
if 'r' in mode and self.use_ra:
|
165 |
+
condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1,
|
166 |
+
1) # B N L C
|
167 |
+
condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c')
|
168 |
+
|
169 |
+
attn_output = self.attn_refview(
|
170 |
+
norm_hidden_states,
|
171 |
+
encoder_hidden_states=condition_embed,
|
172 |
+
attention_mask=None,
|
173 |
+
**cross_attention_kwargs
|
174 |
+
)
|
175 |
+
ref_scale_timing = ref_scale
|
176 |
+
if isinstance(ref_scale, torch.Tensor):
|
177 |
+
ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1)
|
178 |
+
for _ in range(attn_output.ndim - 1):
|
179 |
+
ref_scale_timing = ref_scale_timing.unsqueeze(-1)
|
180 |
+
hidden_states = ref_scale_timing * attn_output + hidden_states
|
181 |
+
if hidden_states.ndim == 4:
|
182 |
+
hidden_states = hidden_states.squeeze(1)
|
183 |
+
|
184 |
+
# 1.3 Multiview Attention
|
185 |
+
if num_in_batch > 1 and self.use_ma:
|
186 |
+
multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch)
|
187 |
+
|
188 |
+
attn_output = self.attn_multiview(
|
189 |
+
multivew_hidden_states,
|
190 |
+
encoder_hidden_states=multivew_hidden_states,
|
191 |
+
**cross_attention_kwargs
|
192 |
+
)
|
193 |
+
|
194 |
+
attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch)
|
195 |
+
|
196 |
+
hidden_states = mva_scale * attn_output + hidden_states
|
197 |
+
if hidden_states.ndim == 4:
|
198 |
+
hidden_states = hidden_states.squeeze(1)
|
199 |
+
|
200 |
+
# 1.2 GLIGEN Control
|
201 |
+
if gligen_kwargs is not None:
|
202 |
+
hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
|
203 |
+
|
204 |
+
# 3. Cross-Attention
|
205 |
+
if self.attn2 is not None:
|
206 |
+
if self.norm_type == "ada_norm":
|
207 |
+
norm_hidden_states = self.norm2(hidden_states, timestep)
|
208 |
+
elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
|
209 |
+
norm_hidden_states = self.norm2(hidden_states)
|
210 |
+
elif self.norm_type == "ada_norm_single":
|
211 |
+
# For PixArt norm2 isn't applied here:
|
212 |
+
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
|
213 |
+
norm_hidden_states = hidden_states
|
214 |
+
elif self.norm_type == "ada_norm_continuous":
|
215 |
+
norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
|
216 |
+
else:
|
217 |
+
raise ValueError("Incorrect norm")
|
218 |
+
|
219 |
+
if self.pos_embed is not None and self.norm_type != "ada_norm_single":
|
220 |
+
norm_hidden_states = self.pos_embed(norm_hidden_states)
|
221 |
+
|
222 |
+
attn_output = self.attn2(
|
223 |
+
norm_hidden_states,
|
224 |
+
encoder_hidden_states=encoder_hidden_states,
|
225 |
+
attention_mask=encoder_attention_mask,
|
226 |
+
**cross_attention_kwargs,
|
227 |
+
)
|
228 |
+
|
229 |
+
hidden_states = attn_output + hidden_states
|
230 |
+
|
231 |
+
# 4. Feed-forward
|
232 |
+
# i2vgen doesn't have this norm 🤷♂️
|
233 |
+
if self.norm_type == "ada_norm_continuous":
|
234 |
+
norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
|
235 |
+
elif not self.norm_type == "ada_norm_single":
|
236 |
+
norm_hidden_states = self.norm3(hidden_states)
|
237 |
+
|
238 |
+
if self.norm_type == "ada_norm_zero":
|
239 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
240 |
+
|
241 |
+
if self.norm_type == "ada_norm_single":
|
242 |
+
norm_hidden_states = self.norm2(hidden_states)
|
243 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
|
244 |
+
|
245 |
+
if self._chunk_size is not None:
|
246 |
+
# "feed_forward_chunk_size" can be used to save memory
|
247 |
+
ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
|
248 |
+
else:
|
249 |
+
ff_output = self.ff(norm_hidden_states)
|
250 |
+
|
251 |
+
if self.norm_type == "ada_norm_zero":
|
252 |
+
ff_output = gate_mlp.unsqueeze(1) * ff_output
|
253 |
+
elif self.norm_type == "ada_norm_single":
|
254 |
+
ff_output = gate_mlp * ff_output
|
255 |
+
|
256 |
+
hidden_states = ff_output + hidden_states
|
257 |
+
if hidden_states.ndim == 4:
|
258 |
+
hidden_states = hidden_states.squeeze(1)
|
259 |
+
|
260 |
+
return hidden_states
|
261 |
+
|
262 |
+
|
263 |
+
class UNet2p5DConditionModel(torch.nn.Module):
|
264 |
+
def __init__(self, unet: UNet2DConditionModel) -> None:
|
265 |
+
super().__init__()
|
266 |
+
self.unet = unet
|
267 |
+
|
268 |
+
self.use_ma = True
|
269 |
+
self.use_ra = True
|
270 |
+
self.use_camera_embedding = True
|
271 |
+
self.use_dual_stream = True
|
272 |
+
|
273 |
+
if self.use_dual_stream:
|
274 |
+
self.unet_dual = copy.deepcopy(unet)
|
275 |
+
self.init_attention(self.unet_dual)
|
276 |
+
self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra)
|
277 |
+
self.init_condition()
|
278 |
+
self.init_camera_embedding()
|
279 |
+
|
280 |
+
@staticmethod
|
281 |
+
def from_pretrained(pretrained_model_name_or_path, **kwargs):
|
282 |
+
torch_dtype = kwargs.pop('torch_dtype', torch.float32)
|
283 |
+
config_path = os.path.join(pretrained_model_name_or_path, 'config.json')
|
284 |
+
unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin')
|
285 |
+
with open(config_path, 'r', encoding='utf-8') as file:
|
286 |
+
config = json.load(file)
|
287 |
+
unet = UNet2DConditionModel(**config)
|
288 |
+
unet = UNet2p5DConditionModel(unet)
|
289 |
+
unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True)
|
290 |
+
unet.load_state_dict(unet_ckpt, strict=True)
|
291 |
+
unet = unet.to(torch_dtype)
|
292 |
+
return unet
|
293 |
+
|
294 |
+
def init_condition(self):
|
295 |
+
self.unet.conv_in = torch.nn.Conv2d(
|
296 |
+
12,
|
297 |
+
self.unet.conv_in.out_channels,
|
298 |
+
kernel_size=self.unet.conv_in.kernel_size,
|
299 |
+
stride=self.unet.conv_in.stride,
|
300 |
+
padding=self.unet.conv_in.padding,
|
301 |
+
dilation=self.unet.conv_in.dilation,
|
302 |
+
groups=self.unet.conv_in.groups,
|
303 |
+
bias=self.unet.conv_in.bias is not None)
|
304 |
+
|
305 |
+
self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024))
|
306 |
+
self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024))
|
307 |
+
|
308 |
+
def init_camera_embedding(self):
|
309 |
+
|
310 |
+
if self.use_camera_embedding:
|
311 |
+
time_embed_dim = 1280
|
312 |
+
self.max_num_ref_image = 5
|
313 |
+
self.max_num_gen_image = 12 * 3 + 4 * 2
|
314 |
+
self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim)
|
315 |
+
|
316 |
+
def init_attention(self, unet, use_ma=False, use_ra=False):

    for down_block_i, down_block in enumerate(unet.down_blocks):
        if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention:
            for attn_i, attn in enumerate(down_block.attentions):
                for transformer_i, transformer in enumerate(attn.transformer_blocks):
                    if isinstance(transformer, BasicTransformerBlock):
                        attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(
                            transformer,
                            f'down_{down_block_i}_{attn_i}_{transformer_i}',
                            use_ma, use_ra)

    if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention:
        for attn_i, attn in enumerate(unet.mid_block.attentions):
            for transformer_i, transformer in enumerate(attn.transformer_blocks):
                if isinstance(transformer, BasicTransformerBlock):
                    attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(
                        transformer,
                        f'mid_{attn_i}_{transformer_i}',
                        use_ma, use_ra)

    for up_block_i, up_block in enumerate(unet.up_blocks):
        if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention:
            for attn_i, attn in enumerate(up_block.attentions):
                for transformer_i, transformer in enumerate(attn.transformer_blocks):
                    if isinstance(transformer, BasicTransformerBlock):
                        attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(
                            transformer,
                            f'up_{up_block_i}_{attn_i}_{transformer_i}',
                            use_ma, use_ra)

def __getattr__(self, name: str):
    try:
        return super().__getattr__(name)
    except AttributeError:
        return getattr(self.unet, name)

def forward(
    self, sample, timestep, encoder_hidden_states,
    *args, down_intrablock_additional_residuals=None,
    down_block_res_samples=None, mid_block_res_sample=None,
    **cached_condition,
):
    B, N_gen, _, H, W = sample.shape
    assert H == W

    if self.use_camera_embedding:
        camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image
        camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)')
    else:
        camera_info_gen = None

    sample = [sample]
    if 'normal_imgs' in cached_condition:
        sample.append(cached_condition["normal_imgs"])
    if 'position_imgs' in cached_condition:
        sample.append(cached_condition["position_imgs"])
    sample = torch.cat(sample, dim=2)

    sample = rearrange(sample, 'b n c h w -> (b n) c h w')

    encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1)
    encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c')

    if self.use_ra:
        if 'condition_embed_dict' in cached_condition:
            condition_embed_dict = cached_condition['condition_embed_dict']
        else:
            condition_embed_dict = {}
            ref_latents = cached_condition['ref_latents']
            N_ref = ref_latents.shape[1]
            if self.use_camera_embedding:
                camera_info_ref = cached_condition['camera_info_ref']
                camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)')
            else:
                camera_info_ref = None

            ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w')

            encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1)
            encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c')

            noisy_ref_latents = ref_latents
            timestep_ref = 0

            if self.use_dual_stream:
                unet_ref = self.unet_dual
            else:
                unet_ref = self.unet
            unet_ref(
                noisy_ref_latents, timestep_ref,
                encoder_hidden_states=encoder_hidden_states_ref,
                class_labels=camera_info_ref,
                # **kwargs
                return_dict=False,
                cross_attention_kwargs={
                    'mode': 'w', 'num_in_batch': N_ref,
                    'condition_embed_dict': condition_embed_dict},
            )
            cached_condition['condition_embed_dict'] = condition_embed_dict
    else:
        condition_embed_dict = None

    mva_scale = cached_condition.get('mva_scale', 1.0)
    ref_scale = cached_condition.get('ref_scale', 1.0)

    return self.unet(
        sample, timestep,
        encoder_hidden_states_gen, *args,
        class_labels=camera_info_gen,
        down_intrablock_additional_residuals=[
            sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals
        ] if down_intrablock_additional_residuals is not None else None,
        down_block_additional_residuals=[
            sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples
        ] if down_block_res_samples is not None else None,
        mid_block_additional_residual=(
            mid_block_res_sample.to(dtype=self.unet.dtype)
            if mid_block_res_sample is not None else None
        ),
        return_dict=False,
        cross_attention_kwargs={
            'mode': 'r', 'num_in_batch': N_gen,
            'condition_embed_dict': condition_embed_dict,
            'mva_scale': mva_scale,
            'ref_scale': ref_scale,
        },
    )
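A quick way to sanity-check the tensor bookkeeping in the forward pass above is to reproduce the channel concatenation and view-to-batch folding on dummy tensors. The sketch below is illustrative only; the batch size, view count and channel counts are assumptions, not values taken from this file.

# Standalone sketch (not part of the repository): the per-view latents, normal maps and
# position maps are concatenated on the channel axis, then the view axis is folded into
# the batch before the wrapped UNet is called.
import torch
from einops import rearrange

B, N_gen, C, H, W = 1, 6, 4, 64, 64              # batch, generated views, latent channels, spatial size
latents = torch.randn(B, N_gen, C, H, W)
normal_imgs = torch.randn(B, N_gen, C, H, W)     # assumed to already be VAE-encoded, like the latents
position_imgs = torch.randn(B, N_gen, C, H, W)

sample = torch.cat([latents, normal_imgs, position_imgs], dim=2)   # (B, N, 3*C, H, W)
sample = rearrange(sample, 'b n c h w -> (b n) c h w')             # (B*N, 3*C, H, W)
print(sample.shape)  # torch.Size([6, 12, 64, 64])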
build/lib/hy3dgen/texgen/pipelines.py
ADDED
@@ -0,0 +1,227 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


import logging
import os

import numpy as np
import torch
from PIL import Image

from .differentiable_renderer.mesh_render import MeshRender
from .utils.dehighlight_utils import Light_Shadow_Remover
from .utils.multiview_utils import Multiview_Diffusion_Net
from .utils.uv_warp_utils import mesh_uv_wrap

logger = logging.getLogger(__name__)


class Hunyuan3DTexGenConfig:

    def __init__(self, light_remover_ckpt_path, multiview_ckpt_path):
        self.device = 'cpu'
        self.light_remover_ckpt_path = light_remover_ckpt_path
        self.multiview_ckpt_path = multiview_ckpt_path

        self.candidate_camera_azims = [0, 90, 180, 270, 0, 180]
        self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90]
        self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05]

        self.render_size = 2048
        self.texture_size = 1024
        self.bake_exp = 4
        self.merge_method = 'fast'


class Hunyuan3DPaintPipeline:
    @classmethod
    def from_pretrained(cls, model_path):
        original_model_path = model_path
        if not os.path.exists(model_path):
            # try local path
            base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
            model_path = os.path.expanduser(os.path.join(base_dir, model_path))

            delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
            multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')

            if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path):
                try:
                    import huggingface_hub
                    # download from huggingface
                    model_path = huggingface_hub.snapshot_download(repo_id=original_model_path)
                    delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
                    multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
                    return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
                except ImportError:
                    logger.warning(
                        "You need to install HuggingFace Hub to load models from the hub."
                    )
                    raise RuntimeError(f"Model path {model_path} not found")
            else:
                return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))

        raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface")

    def __init__(self, config):
        self.config = config
        self.models = {}
        self.render = MeshRender(
            default_resolution=self.config.render_size,
            texture_size=self.config.texture_size)

        self.load_models()

    def load_models(self):
        # empty cuda cache
        torch.cuda.empty_cache()
        # Load model
        self.models['delight_model'] = Light_Shadow_Remover(self.config)
        self.models['multiview_model'] = Multiview_Diffusion_Net(self.config)

    def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True):
        normal_maps = []
        for elev, azim in zip(camera_elevs, camera_azims):
            normal_map = self.render.render_normal(
                elev, azim, use_abs_coor=use_abs_coor, return_type='pl')
            normal_maps.append(normal_map)

        return normal_maps

    def render_position_multiview(self, camera_elevs, camera_azims):
        position_maps = []
        for elev, azim in zip(camera_elevs, camera_azims):
            position_map = self.render.render_position(
                elev, azim, return_type='pl')
            position_maps.append(position_map)

        return position_maps

    def bake_from_multiview(self, views, camera_elevs,
                            camera_azims, view_weights, method='graphcut'):
        project_textures, project_weighted_cos_maps = [], []
        project_boundary_maps = []
        for view, camera_elev, camera_azim, weight in zip(
                views, camera_elevs, camera_azims, view_weights):
            project_texture, project_cos_map, project_boundary_map = self.render.back_project(
                view, camera_elev, camera_azim)
            project_cos_map = weight * (project_cos_map ** self.config.bake_exp)
            project_textures.append(project_texture)
            project_weighted_cos_maps.append(project_cos_map)
            project_boundary_maps.append(project_boundary_map)

        if method == 'fast':
            texture, ori_trust_map = self.render.fast_bake_texture(
                project_textures, project_weighted_cos_maps)
        else:
            raise ValueError(f'no method {method}')
        return texture, ori_trust_map > 1E-8

    def texture_inpaint(self, texture, mask):

        texture_np = self.render.uv_inpaint(texture, mask)
        texture = torch.tensor(texture_np / 255).float().to(texture.device)

        return texture

    def recenter_image(self, image, border_ratio=0.2):
        if image.mode == 'RGB':
            return image
        elif image.mode == 'L':
            image = image.convert('RGB')
            return image

        alpha_channel = np.array(image)[:, :, 3]
        non_zero_indices = np.argwhere(alpha_channel > 0)
        if non_zero_indices.size == 0:
            raise ValueError("Image is fully transparent")

        min_row, min_col = non_zero_indices.min(axis=0)
        max_row, max_col = non_zero_indices.max(axis=0)

        cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1))

        width, height = cropped_image.size
        border_width = int(width * border_ratio)
        border_height = int(height * border_ratio)

        new_width = width + 2 * border_width
        new_height = height + 2 * border_height

        square_size = max(new_width, new_height)

        new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0))

        paste_x = (square_size - new_width) // 2 + border_width
        paste_y = (square_size - new_height) // 2 + border_height

        new_image.paste(cropped_image, (paste_x, paste_y))
        return new_image

    @torch.no_grad()
    def __call__(self, mesh, image):

        if isinstance(image, str):
            image_prompt = Image.open(image)
        else:
            image_prompt = image

        image_prompt = self.recenter_image(image_prompt)

        image_prompt = self.models['delight_model'](image_prompt)

        mesh = mesh_uv_wrap(mesh)

        self.render.load_mesh(mesh)

        selected_camera_elevs, selected_camera_azims, selected_view_weights = \
            self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights

        normal_maps = self.render_normal_multiview(
            selected_camera_elevs, selected_camera_azims, use_abs_coor=True)
        position_maps = self.render_position_multiview(
            selected_camera_elevs, selected_camera_azims)

        camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[
            elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in
                       zip(selected_camera_azims, selected_camera_elevs)]
        multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info)

        for i in range(len(multiviews)):
            multiviews[i] = multiviews[i].resize(
                (self.config.render_size, self.config.render_size))

        texture, mask = self.bake_from_multiview(multiviews,
                                                 selected_camera_elevs, selected_camera_azims, selected_view_weights,
                                                 method=self.config.merge_method)

        mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)

        texture = self.texture_inpaint(texture, mask_np)

        self.render.set_texture(texture)
        textured_mesh = self.render.save_mesh()

        return textured_mesh
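For orientation, here is a minimal end-to-end usage sketch of the paint pipeline above. It assumes the 'tencent/Hunyuan3D-2' weights, the bundled assets/shoes.png demo image, and that the texgen package __init__ re-exports Hunyuan3DPaintPipeline; the mesh and export paths are placeholders.

# Hypothetical usage sketch (repo id and file paths are assumptions, not taken from this file):
import trimesh
from hy3dgen.texgen import Hunyuan3DPaintPipeline

pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = trimesh.load('demo_shape.glb', force='mesh')        # an untextured mesh, e.g. from the shape pipeline
textured_mesh = pipeline(mesh, image='assets/shoes.png')   # returns a trimesh with a baked texture
textured_mesh.export('demo_textured.glb')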
build/lib/hy3dgen/texgen/utils/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py
ADDED
@@ -0,0 +1,132 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


import torch
from diffusers import EulerAncestralDiscreteScheduler
from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \
    AutoencoderKL


class Img2img_Control_Ip_adapter:
    def __init__(self, device):
        controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16,
                                                     variant="fp16", use_safetensors=True)
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
        )
        pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
        pipe.set_ip_adapter_scale(0.7)

        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
        # pipe.enable_model_cpu_offload()
        self.pipe = pipe.to(device)

    def __call__(
            self,
            prompt,
            control_image,
            ip_adapter_image,
            negative_prompt,
            height=512,
            width=512,
            num_inference_steps=20,
            guidance_scale=8.0,
            controlnet_conditioning_scale=1.0,
            output_type="pil",
            **kwargs,
    ):
        results = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=control_image,
            ip_adapter_image=ip_adapter_image,
            generator=torch.manual_seed(42),
            seed=42,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            strength=1,
            # clip_skip=2,
            height=height,
            width=width,
            output_type=output_type,
            **kwargs,
        ).images[0]
        return results


################################################################

class HesModel:
    def __init__(self, ):
        controlnet_depth = ControlNetModel.from_pretrained(
            'diffusers/controlnet-depth-sdxl-1.0',
            torch_dtype=torch.float16,
            variant="fp16",
            use_safetensors=True
        )
        self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
            'stabilityai/stable-diffusion-xl-base-1.0',
            torch_dtype=torch.float16,
            variant="fp16",
            controlnet=controlnet_depth,
            use_safetensors=True,
        )
        self.pipe.vae = AutoencoderKL.from_pretrained(
            'madebyollin/sdxl-vae-fp16-fix',
            torch_dtype=torch.float16
        )

        self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors")
        self.pipe.set_ip_adapter_scale(0.7)
        self.pipe.to("cuda")

    def __call__(self,
                 init_image,
                 control_image,
                 ip_adapter_image=None,
                 prompt='3D image',
                 negative_prompt='2D image',
                 seed=42,
                 strength=0.8,
                 num_inference_steps=40,
                 guidance_scale=7.5,
                 controlnet_conditioning_scale=0.5,
                 **kwargs
                 ):
        image = self.pipe(
            prompt=prompt,
            image=init_image,
            control_image=control_image,
            ip_adapter_image=ip_adapter_image,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            seed=seed,
            **kwargs
        ).images[0]
        return image
build/lib/hy3dgen/texgen/utils/counter_utils.py
ADDED
@@ -0,0 +1,58 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


class RunningStats():
    def __init__(self) -> None:
        self.count = 0
        self.sum = 0
        self.mean = 0
        self.min = None
        self.max = None

    def add_value(self, value):
        self.count += 1
        self.sum += value
        self.mean = self.sum / self.count

        if self.min is None or value < self.min:
            self.min = value

        if self.max is None or value > self.max:
            self.max = value

    def get_count(self):
        return self.count

    def get_sum(self):
        return self.sum

    def get_mean(self):
        return self.mean

    def get_min(self):
        return self.min

    def get_max(self):
        return self.max
build/lib/hy3dgen/texgen/utils/dehighlight_utils.py
ADDED
@@ -0,0 +1,84 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler


class Light_Shadow_Remover():
    def __init__(self, config):
        self.device = config.device
        self.cfg_image = 1.5
        self.cfg_text = 1.0

        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            config.light_remover_ckpt_path,
            torch_dtype=torch.float16,
            safety_checker=None,
        )
        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
        pipeline.set_progress_bar_config(disable=True)

        # self.pipeline = pipeline.to(self.device, torch.float16)
        self.pipeline = pipeline  # Needed to avoid displaying the warning

    @torch.no_grad()
    def __call__(self, image):

        image = image.resize((512, 512))

        if image.mode == 'RGBA':
            image_array = np.array(image)
            alpha_channel = image_array[:, :, 3]
            erosion_size = 3
            kernel = np.ones((erosion_size, erosion_size), np.uint8)
            alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
            image_array[alpha_channel == 0, :3] = 255
            image_array[:, :, 3] = alpha_channel
            image = Image.fromarray(image_array)

            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
            alpha = image_tensor[:, :, 3:]
            rgb_target = image_tensor[:, :, :3]
        else:
            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
            alpha = torch.ones_like(image_tensor)[:, :, :1]
            rgb_target = image_tensor[:, :, :3]

        image = image.convert('RGB')

        image = self.pipeline(
            prompt="",
            image=image,
            generator=torch.manual_seed(42),
            height=512,
            width=512,
            num_inference_steps=50,
            image_guidance_scale=self.cfg_image,
            guidance_scale=self.cfg_text,
        ).images[0]

        return image
build/lib/hy3dgen/texgen/utils/multiview_utils.py
ADDED
@@ -0,0 +1,86 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import random

import numpy as np
import torch
from diffusers import DiffusionPipeline
from diffusers import EulerAncestralDiscreteScheduler


class Multiview_Diffusion_Net():
    def __init__(self, config) -> None:
        self.device = config.device
        self.view_size = 512
        multiview_ckpt_path = config.multiview_ckpt_path

        current_file_path = os.path.abspath(__file__)
        custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint')

        pipeline = DiffusionPipeline.from_pretrained(
            multiview_ckpt_path,
            custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16)

        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config,
                                                                         timestep_spacing='trailing')

        pipeline.set_progress_bar_config(disable=True)
        self.pipeline = pipeline  # .to(self.device)  # only for cosmetics and not display the warning

    def seed_everything(self, seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        os.environ["PL_GLOBAL_SEED"] = str(seed)

    def __call__(self, input_image, control_images, camera_info):

        self.seed_everything(0)

        input_image = input_image.resize((self.view_size, self.view_size))
        for i in range(len(control_images)):
            control_images[i] = control_images[i].resize((self.view_size, self.view_size))
            if control_images[i].mode == 'L':
                control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1')

        kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0))

        num_view = len(control_images) // 2
        normal_image = [[control_images[i] for i in range(num_view)]]
        position_image = [[control_images[i + num_view] for i in range(num_view)]]

        camera_info_gen = [camera_info]
        camera_info_ref = [[0]]
        kwargs['width'] = self.view_size
        kwargs['height'] = self.view_size
        kwargs['num_in_batch'] = num_view
        kwargs['camera_info_gen'] = camera_info_gen
        kwargs['camera_info_ref'] = camera_info_ref
        kwargs["normal_imgs"] = normal_image
        kwargs["position_imgs"] = position_image

        mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images
        return mvd_image
build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py
ADDED
@@ -0,0 +1,46 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import trimesh


def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
    if method == 'trimesh':
        mesh_simplify_trimesh(mesh_path, remesh_path)
    else:
        raise NotImplementedError(f'Method {method} has not been implemented.')


def mesh_simplify_trimesh(inputpath, outputpath):
    import pymeshlab
    ms = pymeshlab.MeshSet()
    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)

    courent = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
    face_num = courent.faces.shape[0]

    if face_num > 100000:
        courent = courent.simplify_quadric_decimation(40000)
    courent.export(outputpath)
build/lib/hy3dgen/texgen/utils/uv_warp_utils.py
ADDED
@@ -0,0 +1,42 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import trimesh
import xatlas


def mesh_uv_wrap(mesh):
    if isinstance(mesh, trimesh.Scene):
        mesh = mesh.dump(concatenate=True)

    # if len(mesh.faces) > 50000:
    #     raise ValueError("The mesh has more than 50,000 faces, which is not supported.")

    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)

    mesh.vertices = mesh.vertices[vmapping]
    mesh.faces = indices
    mesh.visual.uv = uvs

    return mesh
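A toy sketch of mesh_uv_wrap follows, assuming trimesh and xatlas are installed; the box mesh is only a stand-in for a real asset, and the printed shapes just illustrate that xatlas re-indexes the vertices and faces while attaching per-vertex UVs.

# Illustrative only (not part of the repository):
import trimesh
from hy3dgen.texgen.utils.uv_warp_utils import mesh_uv_wrap

mesh = trimesh.creation.box()                     # any trimesh.Trimesh works; Scenes are flattened first
wrapped = mesh_uv_wrap(mesh)
print(wrapped.vertices.shape, wrapped.faces.shape)  # vertices re-indexed by xatlas, faces refer to the new order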
build/lib/hy3dgen/text2image.py
ADDED
@@ -0,0 +1,93 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


import os
import random

import numpy as np
import torch
from diffusers import AutoPipelineForText2Image


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ["PL_GLOBAL_SEED"] = str(seed)


class HunyuanDiTPipeline:
    def __init__(
            self,
            model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
            device='cpu'
    ):
        torch.set_default_device('cpu')
        self.device = device
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            enable_pag=True,
            pag_applied_layers=["blocks.(16|17|18|19)"]
        )  # .to(device)  # needed to avoid displaying the warning
        self.pos_txt = ",白色背景,3D风格,最佳质量"
        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
                       "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
                       "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
                       "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"

    def compile(self):
        # accelerate the hunyuan-dit transformer; the first inference will take a long time
        torch.set_float32_matmul_precision('high')
        self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True)
        # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True)
        generator = torch.Generator(device=self.pipe.device)  # infer once for hot-start
        out_img = self.pipe(
            prompt='美少女战士',
            negative_prompt='模糊',
            num_inference_steps=25,
            pag_scale=1.3,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        seed_everything(seed)
        generator = torch.Generator(device="cuda")  # self.pipe.device
        generator = generator.manual_seed(int(seed))
        out_img = self.pipe(
            prompt=self.pos_txt + prompt,
            negative_prompt=self.neg_txt,
            num_inference_steps=20,
            pag_scale=1.3,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]
        return out_img
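An illustrative call of HunyuanDiTPipeline, assuming a CUDA device is present (the __call__ above hard-codes a CUDA generator); the prompt and output path are made up for the example.

# Hypothetical usage sketch (not taken from this file):
from hy3dgen.text2image import HunyuanDiTPipeline

t2i = HunyuanDiTPipeline()                     # default checkpoint: HunyuanDiT v1.1 distilled
image = t2i('a pair of white sport shoes')     # pos_txt/neg_txt style prompts above are added automatically
image.save('t2i_demo.png')
# note: as wired up above, the pipeline weights are not moved to the GPU, so generation is slow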
dist/hy3dgen-2.0.0-py3.12.egg
ADDED
Binary file (189 kB).
hy3dgen.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,3 @@
Metadata-Version: 2.2
Name: hy3dgen
Version: 2.0.0
hy3dgen.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,37 @@
README.md
setup.py
hy3dgen/__init__.py
hy3dgen/rembg.py
hy3dgen/text2image.py
hy3dgen.egg-info/PKG-INFO
hy3dgen.egg-info/SOURCES.txt
hy3dgen.egg-info/dependency_links.txt
hy3dgen.egg-info/top_level.txt
hy3dgen/shapegen/__init__.py
hy3dgen/shapegen/pipelines.py
hy3dgen/shapegen/postprocessors.py
hy3dgen/shapegen/preprocessors.py
hy3dgen/shapegen/schedulers.py
hy3dgen/shapegen/models/__init__.py
hy3dgen/shapegen/models/conditioner.py
hy3dgen/shapegen/models/hunyuan3ddit.py
hy3dgen/shapegen/models/vae.py
hy3dgen/texgen/__init__.py
hy3dgen/texgen/pipelines.py
hy3dgen/texgen/differentiable_renderer/__init__.py
hy3dgen/texgen/differentiable_renderer/camera_utils.py
hy3dgen/texgen/differentiable_renderer/mesh_processor.py
hy3dgen/texgen/differentiable_renderer/mesh_render.py
hy3dgen/texgen/differentiable_renderer/mesh_utils.py
hy3dgen/texgen/differentiable_renderer/setup.py
hy3dgen/texgen/hunyuanpaint/__init__.py
hy3dgen/texgen/hunyuanpaint/pipeline.py
hy3dgen/texgen/hunyuanpaint/unet/__init__.py
hy3dgen/texgen/hunyuanpaint/unet/modules.py
hy3dgen/texgen/utils/__init__.py
hy3dgen/texgen/utils/alignImg4Tex_utils.py
hy3dgen/texgen/utils/counter_utils.py
hy3dgen/texgen/utils/dehighlight_utils.py
hy3dgen/texgen/utils/multiview_utils.py
hy3dgen/texgen/utils/simplify_mesh_utils.py
hy3dgen/texgen/utils/uv_warp_utils.py
hy3dgen.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
hy3dgen.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
hy3dgen
hy3dgen/__init__.py
ADDED
@@ -0,0 +1,23 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
hy3dgen/rembg.py
ADDED
@@ -0,0 +1,36 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


from PIL import Image
from rembg import remove, new_session


class BackgroundRemover():
    def __init__(self):
        self.session = new_session()

    def __call__(self, image: Image.Image):
        output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
        return output
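A minimal background-removal sketch using the class above; the input path reuses the bundled assets/shoes.png demo image and the output name is arbitrary.

# Illustrative only (not part of the repository):
from PIL import Image
from hy3dgen.rembg import BackgroundRemover

rembg = BackgroundRemover()
image = Image.open('assets/shoes.png').convert('RGB')
image = rembg(image)            # returns an RGBA image with the background made transparent
image.save('shoes_rgba.png')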
hy3dgen/shapegen/__init__.py
ADDED
@@ -0,0 +1,27 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py
ADDED
@@ -0,0 +1,28 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.


from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
from .hunyuan3ddit import Hunyuan3DDiT
from .vae import ShapeVAE
hy3dgen/shapegen/models/conditioner.py
ADDED
@@ -0,0 +1,165 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import torch
import torch.nn as nn
from torchvision import transforms
from transformers import (
    CLIPVisionModelWithProjection,
    CLIPVisionConfig,
    Dinov2Model,
    Dinov2Config,
)


class ImageEncoder(nn.Module):
    def __init__(
        self,
        version=None,
        config=None,
        use_cls_token=True,
        image_size=224,
        **kwargs,
    ):
        super().__init__()

        if config is None:
            self.model = self.MODEL_CLASS.from_pretrained(version)
        else:
            self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
        self.model.eval()
        self.model.requires_grad_(False)
        self.use_cls_token = use_cls_token
        self.size = image_size // 14
        self.num_patches = (image_size // 14) ** 2
        if self.use_cls_token:
            self.num_patches += 1

        self.transform = transforms.Compose(
            [
                transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
                transforms.CenterCrop(image_size),
                transforms.Normalize(
                    mean=self.mean,
                    std=self.std,
                ),
            ]
        )

    def forward(self, image, mask=None, value_range=(-1, 1)):
        if value_range is not None:
            low, high = value_range
            image = (image - low) / (high - low)

        image = image.to(self.model.device, dtype=self.model.dtype)
        inputs = self.transform(image)
        outputs = self.model(inputs)

        last_hidden_state = outputs.last_hidden_state
        if not self.use_cls_token:
            last_hidden_state = last_hidden_state[:, 1:, :]

        return last_hidden_state

    def unconditional_embedding(self, batch_size):
        device = next(self.model.parameters()).device
        dtype = next(self.model.parameters()).dtype
        zero = torch.zeros(
            batch_size,
            self.num_patches,
            self.model.config.hidden_size,
            device=device,
            dtype=dtype,
        )

        return zero


class CLIPImageEncoder(ImageEncoder):
    MODEL_CLASS = CLIPVisionModelWithProjection
    MODEL_CONFIG_CLASS = CLIPVisionConfig
    mean = [0.48145466, 0.4578275, 0.40821073]
    std = [0.26862954, 0.26130258, 0.27577711]


class DinoImageEncoder(ImageEncoder):
    MODEL_CLASS = Dinov2Model
    MODEL_CONFIG_CLASS = Dinov2Config
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]


def build_image_encoder(config):
    if config['type'] == 'CLIPImageEncoder':
        return CLIPImageEncoder(**config['kwargs'])
    elif config['type'] == 'DinoImageEncoder':
        return DinoImageEncoder(**config['kwargs'])
    else:
        raise ValueError(f'Unknown image encoder type: {config["type"]}')


class DualImageEncoder(nn.Module):
    def __init__(
        self,
        main_image_encoder,
        additional_image_encoder,
    ):
        super().__init__()
        self.main_image_encoder = build_image_encoder(main_image_encoder)
        self.additional_image_encoder = build_image_encoder(additional_image_encoder)

    def forward(self, image, mask=None):
        outputs = {
            'main': self.main_image_encoder(image, mask=mask),
            'additional': self.additional_image_encoder(image, mask=mask),
        }
        return outputs

    def unconditional_embedding(self, batch_size):
        outputs = {
            'main': self.main_image_encoder.unconditional_embedding(batch_size),
            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs


class SingleImageEncoder(nn.Module):
    def __init__(
        self,
        main_image_encoder,
    ):
        super().__init__()
        self.main_image_encoder = build_image_encoder(main_image_encoder)

    def forward(self, image, mask=None):
        outputs = {
            'main': self.main_image_encoder(image, mask=mask),
        }
        return outputs

    def unconditional_embedding(self, batch_size):
        outputs = {
            'main': self.main_image_encoder.unconditional_embedding(batch_size),
        }
        return outputs
hy3dgen/shapegen/models/hunyuan3ddit.py
ADDED
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
import math
|
26 |
+
from dataclasses import dataclass
|
27 |
+
from typing import List, Tuple, Optional
|
28 |
+
|
29 |
+
import torch
|
30 |
+
from einops import rearrange
|
31 |
+
from torch import Tensor, nn
|
32 |
+
|
33 |
+
|
34 |
+
def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
|
35 |
+
x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
36 |
+
x = rearrange(x, "B H L D -> B L (H D)")
|
37 |
+
return x
|
38 |
+
|
39 |
+
|
40 |
+
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
|
41 |
+
"""
|
42 |
+
Create sinusoidal timestep embeddings.
|
43 |
+
:param t: a 1-D Tensor of N indices, one per batch element.
|
44 |
+
These may be fractional.
|
45 |
+
:param dim: the dimension of the output.
|
46 |
+
:param max_period: controls the minimum frequency of the embeddings.
|
47 |
+
:return: an (N, D) Tensor of positional embeddings.
|
48 |
+
"""
|
49 |
+
t = time_factor * t
|
50 |
+
half = dim // 2
|
51 |
+
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
|
52 |
+
t.device
|
53 |
+
)
|
54 |
+
|
55 |
+
args = t[:, None].float() * freqs[None]
|
56 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
57 |
+
if dim % 2:
|
58 |
+
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
59 |
+
if torch.is_floating_point(t):
|
60 |
+
embedding = embedding.to(t)
|
61 |
+
return embedding
|
62 |
+
|
63 |
+
|
64 |
+
class MLPEmbedder(nn.Module):
|
65 |
+
def __init__(self, in_dim: int, hidden_dim: int):
|
66 |
+
super().__init__()
|
67 |
+
self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
|
68 |
+
self.silu = nn.SiLU()
|
69 |
+
self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
|
70 |
+
|
71 |
+
def forward(self, x: Tensor) -> Tensor:
|
72 |
+
return self.out_layer(self.silu(self.in_layer(x)))
|
73 |
+
|
74 |
+
|
75 |
+
class RMSNorm(torch.nn.Module):
|
76 |
+
def __init__(self, dim: int):
|
77 |
+
super().__init__()
|
78 |
+
self.scale = nn.Parameter(torch.ones(dim))
|
79 |
+
|
80 |
+
def forward(self, x: Tensor):
|
81 |
+
x_dtype = x.dtype
|
82 |
+
x = x.float()
|
83 |
+
rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6)
|
84 |
+
return (x * rrms).to(dtype=x_dtype) * self.scale
|
85 |
+
|
86 |
+
|
87 |
+
class QKNorm(torch.nn.Module):
|
88 |
+
def __init__(self, dim: int):
|
89 |
+
super().__init__()
|
90 |
+
self.query_norm = RMSNorm(dim)
|
91 |
+
self.key_norm = RMSNorm(dim)
|
92 |
+
|
93 |
+
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]:
|
94 |
+
q = self.query_norm(q)
|
95 |
+
k = self.key_norm(k)
|
96 |
+
return q.to(v), k.to(v)
|
97 |
+
|
98 |
+
|
99 |
+
class SelfAttention(nn.Module):
|
100 |
+
def __init__(
|
101 |
+
self,
|
102 |
+
dim: int,
|
103 |
+
num_heads: int = 8,
|
104 |
+
qkv_bias: bool = False,
|
105 |
+
):
|
106 |
+
super().__init__()
|
107 |
+
self.num_heads = num_heads
|
108 |
+
head_dim = dim // num_heads
|
109 |
+
|
110 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
111 |
+
self.norm = QKNorm(head_dim)
|
112 |
+
self.proj = nn.Linear(dim, dim)
|
113 |
+
|
114 |
+
def forward(self, x: Tensor, pe: Tensor) -> Tensor:
|
115 |
+
qkv = self.qkv(x)
|
116 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
117 |
+
q, k = self.norm(q, k, v)
|
118 |
+
x = attention(q, k, v, pe=pe)
|
119 |
+
x = self.proj(x)
|
120 |
+
return x
|
121 |
+
|
122 |
+
|
123 |
+
@dataclass
|
124 |
+
class ModulationOut:
|
125 |
+
shift: Tensor
|
126 |
+
scale: Tensor
|
127 |
+
gate: Tensor
|
128 |
+
|
129 |
+
|
130 |
+
class Modulation(nn.Module):
|
131 |
+
def __init__(self, dim: int, double: bool):
|
132 |
+
super().__init__()
|
133 |
+
self.is_double = double
|
134 |
+
self.multiplier = 6 if double else 3
|
135 |
+
self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
|
136 |
+
|
137 |
+
def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
|
138 |
+
out = self.lin(nn.functional.silu(vec))[:, None, :]
|
139 |
+
out = out.chunk(self.multiplier, dim=-1)
|
140 |
+
|
141 |
+
return (
|
142 |
+
ModulationOut(*out[:3]),
|
143 |
+
ModulationOut(*out[3:]) if self.is_double else None,
|
144 |
+
)
|
145 |
+
|
146 |
+
|
147 |
+
class DoubleStreamBlock(nn.Module):
|
148 |
+
def __init__(
|
149 |
+
self,
|
150 |
+
hidden_size: int,
|
151 |
+
num_heads: int,
|
152 |
+
mlp_ratio: float,
|
153 |
+
qkv_bias: bool = False,
|
154 |
+
):
|
155 |
+
super().__init__()
|
156 |
+
mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
157 |
+
self.num_heads = num_heads
|
158 |
+
self.hidden_size = hidden_size
|
159 |
+
self.img_mod = Modulation(hidden_size, double=True)
|
160 |
+
self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
161 |
+
self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
162 |
+
|
163 |
+
self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
164 |
+
self.img_mlp = nn.Sequential(
|
165 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
166 |
+
nn.GELU(approximate="tanh"),
|
167 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
168 |
+
)
|
169 |
+
|
170 |
+
self.txt_mod = Modulation(hidden_size, double=True)
|
171 |
+
self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
172 |
+
self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
|
173 |
+
|
174 |
+
self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
175 |
+
self.txt_mlp = nn.Sequential(
|
176 |
+
nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
|
177 |
+
nn.GELU(approximate="tanh"),
|
178 |
+
nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
|
179 |
+
)
|
180 |
+
|
181 |
+
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]:
|
182 |
+
img_mod1, img_mod2 = self.img_mod(vec)
|
183 |
+
txt_mod1, txt_mod2 = self.txt_mod(vec)
|
184 |
+
|
185 |
+
img_modulated = self.img_norm1(img)
|
186 |
+
img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
|
187 |
+
img_qkv = self.img_attn.qkv(img_modulated)
|
188 |
+
img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
189 |
+
img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
|
190 |
+
|
191 |
+
txt_modulated = self.txt_norm1(txt)
|
192 |
+
txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
|
193 |
+
txt_qkv = self.txt_attn.qkv(txt_modulated)
|
194 |
+
txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
195 |
+
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
|
196 |
+
|
197 |
+
q = torch.cat((txt_q, img_q), dim=2)
|
198 |
+
k = torch.cat((txt_k, img_k), dim=2)
|
199 |
+
v = torch.cat((txt_v, img_v), dim=2)
|
200 |
+
|
201 |
+
attn = attention(q, k, v, pe=pe)
|
202 |
+
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
|
203 |
+
|
204 |
+
img = img + img_mod1.gate * self.img_attn.proj(img_attn)
|
205 |
+
img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
|
206 |
+
|
207 |
+
txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
|
208 |
+
txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
|
209 |
+
return img, txt
|
210 |
+
|
211 |
+
|
212 |
+
class SingleStreamBlock(nn.Module):
|
213 |
+
"""
|
214 |
+
A DiT block with parallel linear layers as described in
|
215 |
+
https://arxiv.org/abs/2302.05442 and adapted modulation interface.
|
216 |
+
"""
|
217 |
+
|
218 |
+
def __init__(
|
219 |
+
self,
|
220 |
+
hidden_size: int,
|
221 |
+
num_heads: int,
|
222 |
+
mlp_ratio: float = 4.0,
|
223 |
+
qk_scale: Optional[float] = None,
|
224 |
+
):
|
225 |
+
super().__init__()
|
226 |
+
|
227 |
+
self.hidden_dim = hidden_size
|
228 |
+
self.num_heads = num_heads
|
229 |
+
head_dim = hidden_size // num_heads
|
230 |
+
self.scale = qk_scale or head_dim ** -0.5
|
231 |
+
|
232 |
+
self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
|
233 |
+
# qkv and mlp_in
|
234 |
+
self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
|
235 |
+
# proj and mlp_out
|
236 |
+
self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
|
237 |
+
|
238 |
+
self.norm = QKNorm(head_dim)
|
239 |
+
|
240 |
+
self.hidden_size = hidden_size
|
241 |
+
self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
242 |
+
|
243 |
+
self.mlp_act = nn.GELU(approximate="tanh")
|
244 |
+
self.modulation = Modulation(hidden_size, double=False)
|
245 |
+
|
246 |
+
def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
|
247 |
+
mod, _ = self.modulation(vec)
|
248 |
+
|
249 |
+
x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
|
250 |
+
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
|
251 |
+
|
252 |
+
q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
|
253 |
+
q, k = self.norm(q, k, v)
|
254 |
+
|
255 |
+
# compute attention
|
256 |
+
attn = attention(q, k, v, pe=pe)
|
257 |
+
# compute activation in mlp stream, cat again and run second linear layer
|
258 |
+
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
|
259 |
+
return x + mod.gate * output
|
260 |
+
|
261 |
+
|
262 |
+
class LastLayer(nn.Module):
|
263 |
+
def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
|
264 |
+
super().__init__()
|
265 |
+
self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
|
266 |
+
self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
|
267 |
+
self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
|
268 |
+
|
269 |
+
def forward(self, x: Tensor, vec: Tensor) -> Tensor:
|
270 |
+
shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
|
271 |
+
x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
|
272 |
+
x = self.linear(x)
|
273 |
+
return x
|
274 |
+
|
275 |
+
|
276 |
+
class Hunyuan3DDiT(nn.Module):
|
277 |
+
def __init__(
|
278 |
+
self,
|
279 |
+
in_channels: int = 64,
|
280 |
+
context_in_dim: int = 1536,
|
281 |
+
hidden_size: int = 1024,
|
282 |
+
mlp_ratio: float = 4.0,
|
283 |
+
num_heads: int = 16,
|
284 |
+
depth: int = 16,
|
285 |
+
depth_single_blocks: int = 32,
|
286 |
+
axes_dim: List[int] = [64],
|
287 |
+
theta: int = 10_000,
|
288 |
+
qkv_bias: bool = True,
|
289 |
+
time_factor: float = 1000,
|
290 |
+
ckpt_path: Optional[str] = None,
|
291 |
+
**kwargs,
|
292 |
+
):
|
293 |
+
super().__init__()
|
294 |
+
self.in_channels = in_channels
|
295 |
+
self.context_in_dim = context_in_dim
|
296 |
+
self.hidden_size = hidden_size
|
297 |
+
self.mlp_ratio = mlp_ratio
|
298 |
+
self.num_heads = num_heads
|
299 |
+
self.depth = depth
|
300 |
+
self.depth_single_blocks = depth_single_blocks
|
301 |
+
self.axes_dim = axes_dim
|
302 |
+
self.theta = theta
|
303 |
+
self.qkv_bias = qkv_bias
|
304 |
+
self.time_factor = time_factor
|
305 |
+
self.out_channels = self.in_channels
|
306 |
+
|
307 |
+
if hidden_size % num_heads != 0:
|
308 |
+
raise ValueError(
|
309 |
+
f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
|
310 |
+
)
|
311 |
+
pe_dim = hidden_size // num_heads
|
312 |
+
if sum(axes_dim) != pe_dim:
|
313 |
+
raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
|
314 |
+
self.hidden_size = hidden_size
|
315 |
+
self.num_heads = num_heads
|
316 |
+
self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
|
317 |
+
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
|
318 |
+
self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
|
319 |
+
|
320 |
+
self.double_blocks = nn.ModuleList(
|
321 |
+
[
|
322 |
+
DoubleStreamBlock(
|
323 |
+
self.hidden_size,
|
324 |
+
self.num_heads,
|
325 |
+
mlp_ratio=mlp_ratio,
|
326 |
+
qkv_bias=qkv_bias,
|
327 |
+
)
|
328 |
+
for _ in range(depth)
|
329 |
+
]
|
330 |
+
)
|
331 |
+
|
332 |
+
self.single_blocks = nn.ModuleList(
|
333 |
+
[
|
334 |
+
SingleStreamBlock(
|
335 |
+
self.hidden_size,
|
336 |
+
self.num_heads,
|
337 |
+
mlp_ratio=mlp_ratio,
|
338 |
+
)
|
339 |
+
for _ in range(depth_single_blocks)
|
340 |
+
]
|
341 |
+
)
|
342 |
+
|
343 |
+
self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
|
344 |
+
|
345 |
+
if ckpt_path is not None:
|
346 |
+
print('restored denoiser ckpt', ckpt_path)
|
347 |
+
|
348 |
+
ckpt = torch.load(ckpt_path, map_location="cpu")
|
349 |
+
if 'state_dict' not in ckpt:
|
350 |
+
# deepspeed ckpt
|
351 |
+
state_dict = {}
|
352 |
+
for k in ckpt.keys():
|
353 |
+
new_k = k.replace('_forward_module.', '')
|
354 |
+
state_dict[new_k] = ckpt[k]
|
355 |
+
else:
|
356 |
+
state_dict = ckpt["state_dict"]
|
357 |
+
|
358 |
+
final_state_dict = {}
|
359 |
+
for k, v in state_dict.items():
|
360 |
+
if k.startswith('model.'):
|
361 |
+
final_state_dict[k.replace('model.', '')] = v
|
362 |
+
else:
|
363 |
+
final_state_dict[k] = v
|
364 |
+
missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
|
365 |
+
print('unexpected keys:', unexpected)
|
366 |
+
print('missing keys:', missing)
|
367 |
+
|
368 |
+
def forward(
|
369 |
+
self,
|
370 |
+
x,
|
371 |
+
t,
|
372 |
+
contexts,
|
373 |
+
**kwargs,
|
374 |
+
) -> Tensor:
|
375 |
+
cond = contexts['main']
|
376 |
+
latent = self.latent_in(x)
|
377 |
+
vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
|
378 |
+
cond = self.cond_in(cond)
|
379 |
+
pe = None
|
380 |
+
|
381 |
+
for block in self.double_blocks:
|
382 |
+
latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
|
383 |
+
|
384 |
+
latent = torch.cat((cond, latent), 1)
|
385 |
+
for block in self.single_blocks:
|
386 |
+
latent = block(latent, vec=vec, pe=pe)
|
387 |
+
|
388 |
+
latent = latent[:, cond.shape[1]:, ...]
|
389 |
+
latent = self.final_layer(latent, vec)
|
390 |
+
return latent
|
hy3dgen/shapegen/models/vae.py
ADDED
@@ -0,0 +1,636 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
from typing import Tuple, List, Union, Optional
|
26 |
+
|
27 |
+
import numpy as np
|
28 |
+
import torch
|
29 |
+
import torch.nn as nn
|
30 |
+
import torch.nn.functional as F
|
31 |
+
from einops import rearrange, repeat
|
32 |
+
from skimage import measure
|
33 |
+
from tqdm import tqdm
|
34 |
+
|
35 |
+
|
36 |
+
class FourierEmbedder(nn.Module):
|
37 |
+
"""The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
|
38 |
+
each feature dimension of `x[..., i]` into:
|
39 |
+
[
|
40 |
+
sin(x[..., i]),
|
41 |
+
sin(f_1*x[..., i]),
|
42 |
+
sin(f_2*x[..., i]),
|
43 |
+
...
|
44 |
+
sin(f_N * x[..., i]),
|
45 |
+
cos(x[..., i]),
|
46 |
+
cos(f_1*x[..., i]),
|
47 |
+
cos(f_2*x[..., i]),
|
48 |
+
...
|
49 |
+
cos(f_N * x[..., i]),
|
50 |
+
x[..., i] # only present if include_input is True.
|
51 |
+
], here f_i is the frequency.
|
52 |
+
|
53 |
+
Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
|
54 |
+
If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
|
55 |
+
Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
|
56 |
+
|
57 |
+
Args:
|
58 |
+
num_freqs (int): the number of frequencies, default is 6;
|
59 |
+
logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
|
60 |
+
otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
|
61 |
+
input_dim (int): the input dimension, default is 3;
|
62 |
+
include_input (bool): include the input tensor or not, default is True.
|
63 |
+
|
64 |
+
Attributes:
|
65 |
+
frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
|
66 |
+
otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
|
67 |
+
|
68 |
+
out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
|
69 |
+
otherwise, it is input_dim * num_freqs * 2.
|
70 |
+
|
71 |
+
"""
|
72 |
+
|
73 |
+
def __init__(self,
|
74 |
+
num_freqs: int = 6,
|
75 |
+
logspace: bool = True,
|
76 |
+
input_dim: int = 3,
|
77 |
+
include_input: bool = True,
|
78 |
+
include_pi: bool = True) -> None:
|
79 |
+
|
80 |
+
"""The initialization"""
|
81 |
+
|
82 |
+
super().__init__()
|
83 |
+
|
84 |
+
if logspace:
|
85 |
+
frequencies = 2.0 ** torch.arange(
|
86 |
+
num_freqs,
|
87 |
+
dtype=torch.float32
|
88 |
+
)
|
89 |
+
else:
|
90 |
+
frequencies = torch.linspace(
|
91 |
+
1.0,
|
92 |
+
2.0 ** (num_freqs - 1),
|
93 |
+
num_freqs,
|
94 |
+
dtype=torch.float32
|
95 |
+
)
|
96 |
+
|
97 |
+
if include_pi:
|
98 |
+
frequencies *= torch.pi
|
99 |
+
|
100 |
+
self.register_buffer("frequencies", frequencies, persistent=False)
|
101 |
+
self.include_input = include_input
|
102 |
+
self.num_freqs = num_freqs
|
103 |
+
|
104 |
+
self.out_dim = self.get_dims(input_dim)
|
105 |
+
|
106 |
+
def get_dims(self, input_dim):
|
107 |
+
temp = 1 if self.include_input or self.num_freqs == 0 else 0
|
108 |
+
out_dim = input_dim * (self.num_freqs * 2 + temp)
|
109 |
+
|
110 |
+
return out_dim
|
111 |
+
|
112 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
113 |
+
""" Forward process.
|
114 |
+
|
115 |
+
Args:
|
116 |
+
x: tensor of shape [..., dim]
|
117 |
+
|
118 |
+
Returns:
|
119 |
+
embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
|
120 |
+
where temp is 1 if include_input is True and 0 otherwise.
|
121 |
+
"""
|
122 |
+
|
123 |
+
if self.num_freqs > 0:
|
124 |
+
embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
|
125 |
+
if self.include_input:
|
126 |
+
return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
|
127 |
+
else:
|
128 |
+
return torch.cat((embed.sin(), embed.cos()), dim=-1)
|
129 |
+
else:
|
130 |
+
return x
|
131 |
+
|
132 |
+
|
133 |
+
class DropPath(nn.Module):
|
134 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
135 |
+
"""
|
136 |
+
|
137 |
+
def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
|
138 |
+
super(DropPath, self).__init__()
|
139 |
+
self.drop_prob = drop_prob
|
140 |
+
self.scale_by_keep = scale_by_keep
|
141 |
+
|
142 |
+
def forward(self, x):
|
143 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
144 |
+
|
145 |
+
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
|
146 |
+
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
|
147 |
+
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
|
148 |
+
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
149 |
+
'survival rate' as the argument.
|
150 |
+
|
151 |
+
"""
|
152 |
+
if self.drop_prob == 0. or not self.training:
|
153 |
+
return x
|
154 |
+
keep_prob = 1 - self.drop_prob
|
155 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
156 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
157 |
+
if keep_prob > 0.0 and self.scale_by_keep:
|
158 |
+
random_tensor.div_(keep_prob)
|
159 |
+
return x * random_tensor
|
160 |
+
|
161 |
+
def extra_repr(self):
|
162 |
+
return f'drop_prob={round(self.drop_prob, 3):0.3f}'
|
163 |
+
|
164 |
+
|
165 |
+
class MLP(nn.Module):
|
166 |
+
def __init__(
|
167 |
+
self, *,
|
168 |
+
width: int,
|
169 |
+
output_width: int = None,
|
170 |
+
drop_path_rate: float = 0.0
|
171 |
+
):
|
172 |
+
super().__init__()
|
173 |
+
self.width = width
|
174 |
+
self.c_fc = nn.Linear(width, width * 4)
|
175 |
+
self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
|
176 |
+
self.gelu = nn.GELU()
|
177 |
+
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
178 |
+
|
179 |
+
def forward(self, x):
|
180 |
+
return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
|
181 |
+
|
182 |
+
|
183 |
+
class QKVMultiheadCrossAttention(nn.Module):
|
184 |
+
def __init__(
|
185 |
+
self,
|
186 |
+
*,
|
187 |
+
heads: int,
|
188 |
+
n_data: Optional[int] = None,
|
189 |
+
width=None,
|
190 |
+
qk_norm=False,
|
191 |
+
norm_layer=nn.LayerNorm
|
192 |
+
):
|
193 |
+
super().__init__()
|
194 |
+
self.heads = heads
|
195 |
+
self.n_data = n_data
|
196 |
+
self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
197 |
+
self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
198 |
+
|
199 |
+
def forward(self, q, kv):
|
200 |
+
_, n_ctx, _ = q.shape
|
201 |
+
bs, n_data, width = kv.shape
|
202 |
+
attn_ch = width // self.heads // 2
|
203 |
+
q = q.view(bs, n_ctx, self.heads, -1)
|
204 |
+
kv = kv.view(bs, n_data, self.heads, -1)
|
205 |
+
k, v = torch.split(kv, attn_ch, dim=-1)
|
206 |
+
|
207 |
+
q = self.q_norm(q)
|
208 |
+
k = self.k_norm(k)
|
209 |
+
|
210 |
+
q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
|
211 |
+
out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
|
212 |
+
|
213 |
+
return out
|
214 |
+
|
215 |
+
|
216 |
+
class MultiheadCrossAttention(nn.Module):
|
217 |
+
def __init__(
|
218 |
+
self,
|
219 |
+
*,
|
220 |
+
width: int,
|
221 |
+
heads: int,
|
222 |
+
qkv_bias: bool = True,
|
223 |
+
n_data: Optional[int] = None,
|
224 |
+
data_width: Optional[int] = None,
|
225 |
+
norm_layer=nn.LayerNorm,
|
226 |
+
qk_norm: bool = False
|
227 |
+
):
|
228 |
+
super().__init__()
|
229 |
+
self.n_data = n_data
|
230 |
+
self.width = width
|
231 |
+
self.heads = heads
|
232 |
+
self.data_width = width if data_width is None else data_width
|
233 |
+
self.c_q = nn.Linear(width, width, bias=qkv_bias)
|
234 |
+
self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
|
235 |
+
self.c_proj = nn.Linear(width, width)
|
236 |
+
self.attention = QKVMultiheadCrossAttention(
|
237 |
+
heads=heads,
|
238 |
+
n_data=n_data,
|
239 |
+
width=width,
|
240 |
+
norm_layer=norm_layer,
|
241 |
+
qk_norm=qk_norm
|
242 |
+
)
|
243 |
+
|
244 |
+
def forward(self, x, data):
|
245 |
+
x = self.c_q(x)
|
246 |
+
data = self.c_kv(data)
|
247 |
+
x = self.attention(x, data)
|
248 |
+
x = self.c_proj(x)
|
249 |
+
return x
|
250 |
+
|
251 |
+
|
252 |
+
class ResidualCrossAttentionBlock(nn.Module):
|
253 |
+
def __init__(
|
254 |
+
self,
|
255 |
+
*,
|
256 |
+
n_data: Optional[int] = None,
|
257 |
+
width: int,
|
258 |
+
heads: int,
|
259 |
+
data_width: Optional[int] = None,
|
260 |
+
qkv_bias: bool = True,
|
261 |
+
norm_layer=nn.LayerNorm,
|
262 |
+
qk_norm: bool = False
|
263 |
+
):
|
264 |
+
super().__init__()
|
265 |
+
|
266 |
+
if data_width is None:
|
267 |
+
data_width = width
|
268 |
+
|
269 |
+
self.attn = MultiheadCrossAttention(
|
270 |
+
n_data=n_data,
|
271 |
+
width=width,
|
272 |
+
heads=heads,
|
273 |
+
data_width=data_width,
|
274 |
+
qkv_bias=qkv_bias,
|
275 |
+
norm_layer=norm_layer,
|
276 |
+
qk_norm=qk_norm
|
277 |
+
)
|
278 |
+
self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
279 |
+
self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
|
280 |
+
self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
281 |
+
self.mlp = MLP(width=width)
|
282 |
+
|
283 |
+
def forward(self, x: torch.Tensor, data: torch.Tensor):
|
284 |
+
x = x + self.attn(self.ln_1(x), self.ln_2(data))
|
285 |
+
x = x + self.mlp(self.ln_3(x))
|
286 |
+
return x
|
287 |
+
|
288 |
+
|
289 |
+
class QKVMultiheadAttention(nn.Module):
|
290 |
+
def __init__(
|
291 |
+
self,
|
292 |
+
*,
|
293 |
+
heads: int,
|
294 |
+
n_ctx: int,
|
295 |
+
width=None,
|
296 |
+
qk_norm=False,
|
297 |
+
norm_layer=nn.LayerNorm
|
298 |
+
):
|
299 |
+
super().__init__()
|
300 |
+
self.heads = heads
|
301 |
+
self.n_ctx = n_ctx
|
302 |
+
self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
303 |
+
self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
|
304 |
+
|
305 |
+
def forward(self, qkv):
|
306 |
+
bs, n_ctx, width = qkv.shape
|
307 |
+
attn_ch = width // self.heads // 3
|
308 |
+
qkv = qkv.view(bs, n_ctx, self.heads, -1)
|
309 |
+
q, k, v = torch.split(qkv, attn_ch, dim=-1)
|
310 |
+
|
311 |
+
q = self.q_norm(q)
|
312 |
+
k = self.k_norm(k)
|
313 |
+
|
314 |
+
q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
|
315 |
+
out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
|
316 |
+
return out
|
317 |
+
|
318 |
+
|
319 |
+
class MultiheadAttention(nn.Module):
|
320 |
+
def __init__(
|
321 |
+
self,
|
322 |
+
*,
|
323 |
+
n_ctx: int,
|
324 |
+
width: int,
|
325 |
+
heads: int,
|
326 |
+
qkv_bias: bool,
|
327 |
+
norm_layer=nn.LayerNorm,
|
328 |
+
qk_norm: bool = False,
|
329 |
+
drop_path_rate: float = 0.0
|
330 |
+
):
|
331 |
+
super().__init__()
|
332 |
+
self.n_ctx = n_ctx
|
333 |
+
self.width = width
|
334 |
+
self.heads = heads
|
335 |
+
self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
|
336 |
+
self.c_proj = nn.Linear(width, width)
|
337 |
+
self.attention = QKVMultiheadAttention(
|
338 |
+
heads=heads,
|
339 |
+
n_ctx=n_ctx,
|
340 |
+
width=width,
|
341 |
+
norm_layer=norm_layer,
|
342 |
+
qk_norm=qk_norm
|
343 |
+
)
|
344 |
+
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
|
345 |
+
|
346 |
+
def forward(self, x):
|
347 |
+
x = self.c_qkv(x)
|
348 |
+
x = self.attention(x)
|
349 |
+
x = self.drop_path(self.c_proj(x))
|
350 |
+
return x
|
351 |
+
|
352 |
+
|
353 |
+
class ResidualAttentionBlock(nn.Module):
|
354 |
+
def __init__(
|
355 |
+
self,
|
356 |
+
*,
|
357 |
+
n_ctx: int,
|
358 |
+
width: int,
|
359 |
+
heads: int,
|
360 |
+
qkv_bias: bool = True,
|
361 |
+
norm_layer=nn.LayerNorm,
|
362 |
+
qk_norm: bool = False,
|
363 |
+
drop_path_rate: float = 0.0,
|
364 |
+
):
|
365 |
+
super().__init__()
|
366 |
+
self.attn = MultiheadAttention(
|
367 |
+
n_ctx=n_ctx,
|
368 |
+
width=width,
|
369 |
+
heads=heads,
|
370 |
+
qkv_bias=qkv_bias,
|
371 |
+
norm_layer=norm_layer,
|
372 |
+
qk_norm=qk_norm,
|
373 |
+
drop_path_rate=drop_path_rate
|
374 |
+
)
|
375 |
+
self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
376 |
+
self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
|
377 |
+
self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
|
378 |
+
|
379 |
+
def forward(self, x: torch.Tensor):
|
380 |
+
x = x + self.attn(self.ln_1(x))
|
381 |
+
x = x + self.mlp(self.ln_2(x))
|
382 |
+
return x
|
383 |
+
|
384 |
+
|
385 |
+
class Transformer(nn.Module):
|
386 |
+
def __init__(
|
387 |
+
self,
|
388 |
+
*,
|
389 |
+
n_ctx: int,
|
390 |
+
width: int,
|
391 |
+
layers: int,
|
392 |
+
heads: int,
|
393 |
+
qkv_bias: bool = True,
|
394 |
+
norm_layer=nn.LayerNorm,
|
395 |
+
qk_norm: bool = False,
|
396 |
+
drop_path_rate: float = 0.0
|
397 |
+
):
|
398 |
+
super().__init__()
|
399 |
+
self.n_ctx = n_ctx
|
400 |
+
self.width = width
|
401 |
+
self.layers = layers
|
402 |
+
self.resblocks = nn.ModuleList(
|
403 |
+
[
|
404 |
+
ResidualAttentionBlock(
|
405 |
+
n_ctx=n_ctx,
|
406 |
+
width=width,
|
407 |
+
heads=heads,
|
408 |
+
qkv_bias=qkv_bias,
|
409 |
+
norm_layer=norm_layer,
|
410 |
+
qk_norm=qk_norm,
|
411 |
+
drop_path_rate=drop_path_rate
|
412 |
+
)
|
413 |
+
for _ in range(layers)
|
414 |
+
]
|
415 |
+
)
|
416 |
+
|
417 |
+
def forward(self, x: torch.Tensor):
|
418 |
+
for block in self.resblocks:
|
419 |
+
x = block(x)
|
420 |
+
return x
|
421 |
+
|
422 |
+
|
423 |
+
class CrossAttentionDecoder(nn.Module):
|
424 |
+
|
425 |
+
def __init__(
|
426 |
+
self,
|
427 |
+
*,
|
428 |
+
num_latents: int,
|
429 |
+
out_channels: int,
|
430 |
+
fourier_embedder: FourierEmbedder,
|
431 |
+
width: int,
|
432 |
+
heads: int,
|
433 |
+
qkv_bias: bool = True,
|
434 |
+
qk_norm: bool = False,
|
435 |
+
label_type: str = "binary"
|
436 |
+
):
|
437 |
+
super().__init__()
|
438 |
+
|
439 |
+
self.fourier_embedder = fourier_embedder
|
440 |
+
|
441 |
+
self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)
|
442 |
+
|
443 |
+
self.cross_attn_decoder = ResidualCrossAttentionBlock(
|
444 |
+
n_data=num_latents,
|
445 |
+
width=width,
|
446 |
+
heads=heads,
|
447 |
+
qkv_bias=qkv_bias,
|
448 |
+
qk_norm=qk_norm
|
449 |
+
)
|
450 |
+
|
451 |
+
self.ln_post = nn.LayerNorm(width)
|
452 |
+
self.output_proj = nn.Linear(width, out_channels)
|
453 |
+
self.label_type = label_type
|
454 |
+
|
455 |
+
def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
|
456 |
+
queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
|
457 |
+
x = self.cross_attn_decoder(queries, latents)
|
458 |
+
x = self.ln_post(x)
|
459 |
+
occ = self.output_proj(x)
|
460 |
+
return occ
|
461 |
+
|
462 |
+
|
463 |
+
def generate_dense_grid_points(bbox_min: np.ndarray,
|
464 |
+
bbox_max: np.ndarray,
|
465 |
+
octree_depth: int,
|
466 |
+
indexing: str = "ij",
|
467 |
+
octree_resolution: int = None,
|
468 |
+
):
|
469 |
+
length = bbox_max - bbox_min
|
470 |
+
num_cells = np.exp2(octree_depth)
|
471 |
+
if octree_resolution is not None:
|
472 |
+
num_cells = octree_resolution
|
473 |
+
|
474 |
+
x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
|
475 |
+
y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
|
476 |
+
z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
|
477 |
+
[xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
|
478 |
+
xyz = np.stack((xs, ys, zs), axis=-1)
|
479 |
+
xyz = xyz.reshape(-1, 3)
|
480 |
+
grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
|
481 |
+
|
482 |
+
return xyz, grid_size, length
|
483 |
+
|
484 |
+
|
485 |
+
def center_vertices(vertices):
|
486 |
+
"""Translate the vertices so that bounding box is centered at zero."""
|
487 |
+
vert_min = vertices.min(dim=0)[0]
|
488 |
+
vert_max = vertices.max(dim=0)[0]
|
489 |
+
vert_center = 0.5 * (vert_min + vert_max)
|
490 |
+
return vertices - vert_center
|
491 |
+
|
492 |
+
|
493 |
+
class Latent2MeshOutput:
|
494 |
+
|
495 |
+
def __init__(self, mesh_v=None, mesh_f=None):
|
496 |
+
self.mesh_v = mesh_v
|
497 |
+
self.mesh_f = mesh_f
|
498 |
+
|
499 |
+
|
500 |
+
class ShapeVAE(nn.Module):
|
501 |
+
def __init__(
|
502 |
+
self,
|
503 |
+
*,
|
504 |
+
num_latents: int,
|
505 |
+
embed_dim: int,
|
506 |
+
width: int,
|
507 |
+
heads: int,
|
508 |
+
num_decoder_layers: int,
|
509 |
+
num_freqs: int = 8,
|
510 |
+
include_pi: bool = True,
|
511 |
+
qkv_bias: bool = True,
|
512 |
+
qk_norm: bool = False,
|
513 |
+
label_type: str = "binary",
|
514 |
+
drop_path_rate: float = 0.0,
|
515 |
+
scale_factor: float = 1.0,
|
516 |
+
):
|
517 |
+
super().__init__()
|
518 |
+
self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
|
519 |
+
|
520 |
+
self.post_kl = nn.Linear(embed_dim, width)
|
521 |
+
|
522 |
+
self.transformer = Transformer(
|
523 |
+
n_ctx=num_latents,
|
524 |
+
width=width,
|
525 |
+
layers=num_decoder_layers,
|
526 |
+
heads=heads,
|
527 |
+
qkv_bias=qkv_bias,
|
528 |
+
qk_norm=qk_norm,
|
529 |
+
drop_path_rate=drop_path_rate
|
530 |
+
)
|
531 |
+
|
532 |
+
self.geo_decoder = CrossAttentionDecoder(
|
533 |
+
fourier_embedder=self.fourier_embedder,
|
534 |
+
out_channels=1,
|
535 |
+
num_latents=num_latents,
|
536 |
+
width=width,
|
537 |
+
heads=heads,
|
538 |
+
qkv_bias=qkv_bias,
|
539 |
+
qk_norm=qk_norm,
|
540 |
+
label_type=label_type,
|
541 |
+
)
|
542 |
+
|
543 |
+
self.scale_factor = scale_factor
|
544 |
+
self.latent_shape = (num_latents, embed_dim)
|
545 |
+
|
546 |
+
def forward(self, latents):
|
547 |
+
latents = self.post_kl(latents)
|
548 |
+
latents = self.transformer(latents)
|
549 |
+
return latents
|
550 |
+
|
551 |
+
@torch.no_grad()
|
552 |
+
def latents2mesh(
|
553 |
+
self,
|
554 |
+
latents: torch.FloatTensor,
|
555 |
+
bounds: Union[Tuple[float], List[float], float] = 1.1,
|
556 |
+
octree_depth: int = 7,
|
557 |
+
num_chunks: int = 10000,
|
558 |
+
mc_level: float = -1 / 512,
|
559 |
+
octree_resolution: int = None,
|
560 |
+
mc_algo: str = 'dmc',
|
561 |
+
):
|
562 |
+
device = latents.device
|
563 |
+
|
564 |
+
# 1. generate query points
|
565 |
+
if isinstance(bounds, float):
|
566 |
+
bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
|
567 |
+
bbox_min = np.array(bounds[0:3])
|
568 |
+
bbox_max = np.array(bounds[3:6])
|
569 |
+
bbox_size = bbox_max - bbox_min
|
570 |
+
xyz_samples, grid_size, length = generate_dense_grid_points(
|
571 |
+
bbox_min=bbox_min,
|
572 |
+
bbox_max=bbox_max,
|
573 |
+
octree_depth=octree_depth,
|
574 |
+
octree_resolution=octree_resolution,
|
575 |
+
indexing="ij"
|
576 |
+
)
|
577 |
+
xyz_samples = torch.FloatTensor(xyz_samples)
|
578 |
+
|
579 |
+
# 2. latents to 3d volume
|
580 |
+
batch_logits = []
|
581 |
+
batch_size = latents.shape[0]
|
582 |
+
for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
|
583 |
+
desc=f"MC Level {mc_level} Implicit Function:"):
|
584 |
+
queries = xyz_samples[start: start + num_chunks, :].to(device)
|
585 |
+
queries = queries.half()
|
586 |
+
batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
|
587 |
+
|
588 |
+
logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
|
589 |
+
if mc_level == -1:
|
590 |
+
mc_level = 0
|
591 |
+
logits = torch.sigmoid(logits) * 2 - 1
|
592 |
+
print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.')
|
593 |
+
batch_logits.append(logits)
|
594 |
+
grid_logits = torch.cat(batch_logits, dim=1)
|
595 |
+
grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()
|
596 |
+
|
597 |
+
# 3. extract surface
|
598 |
+
outputs = []
|
599 |
+
for i in range(batch_size):
|
600 |
+
try:
|
601 |
+
if mc_algo == 'mc':
|
602 |
+
vertices, faces, normals, _ = measure.marching_cubes(
|
603 |
+
grid_logits[i].cpu().numpy(),
|
604 |
+
mc_level,
|
605 |
+
method="lewiner"
|
606 |
+
)
|
607 |
+
vertices = vertices / grid_size * bbox_size + bbox_min
|
608 |
+
elif mc_algo == 'dmc':
|
609 |
+
if not hasattr(self, 'dmc'):
|
610 |
+
try:
|
611 |
+
from diso import DiffDMC
|
612 |
+
except:
|
613 |
+
raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
|
614 |
+
self.dmc = DiffDMC(dtype=torch.float32).to(device)
|
615 |
+
octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
|
616 |
+
sdf = -grid_logits[i] / octree_resolution
|
617 |
+
verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
|
618 |
+
verts = center_vertices(verts)
|
619 |
+
vertices = verts.detach().cpu().numpy()
|
620 |
+
faces = faces.detach().cpu().numpy()[:, ::-1]
|
621 |
+
else:
|
622 |
+
raise ValueError(f"mc_algo {mc_algo} not supported.")
|
623 |
+
|
624 |
+
outputs.append(
|
625 |
+
Latent2MeshOutput(
|
626 |
+
mesh_v=vertices.astype(np.float32),
|
627 |
+
mesh_f=np.ascontiguousarray(faces)
|
628 |
+
)
|
629 |
+
)
|
630 |
+
|
631 |
+
except ValueError:
|
632 |
+
outputs.append(None)
|
633 |
+
except RuntimeError:
|
634 |
+
outputs.append(None)
|
635 |
+
|
636 |
+
return outputs
|
hy3dgen/shapegen/pipelines.py
ADDED
@@ -0,0 +1,589 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Open Source Model Licensed under the Apache License Version 2.0
|
2 |
+
# and Other Licenses of the Third-Party Components therein:
|
3 |
+
# The below Model in this distribution may have been modified by THL A29 Limited
|
4 |
+
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
|
5 |
+
|
6 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
7 |
+
# The below software and/or models in this distribution may have been
|
8 |
+
# modified by THL A29 Limited ("Tencent Modifications").
|
9 |
+
# All Tencent Modifications are Copyright (C) THL A29 Limited.
|
10 |
+
|
11 |
+
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
12 |
+
# except for the third-party components listed below.
|
13 |
+
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
14 |
+
# in the repsective licenses of these third-party components.
|
15 |
+
# Users must comply with all terms and conditions of original licenses of these third-party
|
16 |
+
# components and must ensure that the usage of the third party components adheres to
|
17 |
+
# all relevant laws and regulations.
|
18 |
+
|
19 |
+
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
20 |
+
# their software and algorithms, including trained model weights, parameters (including
|
21 |
+
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
22 |
+
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
23 |
+
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
24 |
+
|
25 |
+
import copy
|
26 |
+
import importlib
|
27 |
+
import inspect
|
28 |
+
import logging
|
29 |
+
import os
|
30 |
+
from typing import List, Optional, Union
|
31 |
+
|
32 |
+
import numpy as np
|
33 |
+
import torch
|
34 |
+
import trimesh
|
35 |
+
import yaml
|
36 |
+
from PIL import Image
|
37 |
+
from diffusers.utils.torch_utils import randn_tensor
|
38 |
+
from tqdm import tqdm
|
39 |
+
|
40 |
+
logger = logging.getLogger(__name__)
|
41 |
+
|
42 |
+
|
43 |
+
def retrieve_timesteps(
|
44 |
+
scheduler,
|
45 |
+
num_inference_steps: Optional[int] = None,
|
46 |
+
device: Optional[Union[str, torch.device]] = None,
|
47 |
+
timesteps: Optional[List[int]] = None,
|
48 |
+
sigmas: Optional[List[float]] = None,
|
49 |
+
**kwargs,
|
50 |
+
):
|
51 |
+
"""
|
52 |
+
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
53 |
+
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
54 |
+
|
55 |
+
Args:
|
56 |
+
scheduler (`SchedulerMixin`):
|
57 |
+
The scheduler to get timesteps from.
|
58 |
+
num_inference_steps (`int`):
|
59 |
+
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
60 |
+
must be `None`.
|
61 |
+
device (`str` or `torch.device`, *optional*):
|
62 |
+
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
63 |
+
timesteps (`List[int]`, *optional*):
|
64 |
+
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
65 |
+
`num_inference_steps` and `sigmas` must be `None`.
|
66 |
+
sigmas (`List[float]`, *optional*):
|
67 |
+
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
68 |
+
`num_inference_steps` and `timesteps` must be `None`.
|
69 |
+
|
70 |
+
Returns:
|
71 |
+
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
72 |
+
second element is the number of inference steps.
|
73 |
+
"""
|
74 |
+
if timesteps is not None and sigmas is not None:
|
75 |
+
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
76 |
+
if timesteps is not None:
|
77 |
+
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
78 |
+
if not accepts_timesteps:
|
79 |
+
raise ValueError(
|
80 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
81 |
+
f" timestep schedules. Please check whether you are using the correct scheduler."
|
82 |
+
)
|
83 |
+
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
84 |
+
timesteps = scheduler.timesteps
|
85 |
+
num_inference_steps = len(timesteps)
|
86 |
+
elif sigmas is not None:
|
87 |
+
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
88 |
+
if not accept_sigmas:
|
89 |
+
raise ValueError(
|
90 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
91 |
+
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
92 |
+
)
|
93 |
+
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
94 |
+
timesteps = scheduler.timesteps
|
95 |
+
num_inference_steps = len(timesteps)
|
96 |
+
else:
|
97 |
+
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
98 |
+
timesteps = scheduler.timesteps
|
99 |
+
return timesteps, num_inference_steps
|
100 |
+
|
101 |
+
|
102 |
+
def export_to_trimesh(mesh_output):
|
103 |
+
if isinstance(mesh_output, list):
|
104 |
+
outputs = []
|
105 |
+
for mesh in mesh_output:
|
106 |
+
if mesh is None:
|
107 |
+
outputs.append(None)
|
108 |
+
else:
|
109 |
+
mesh.mesh_f = mesh.mesh_f[:, ::-1]
|
110 |
+
mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f)
|
111 |
+
outputs.append(mesh_output)
|
112 |
+
return outputs
|
113 |
+
else:
|
114 |
+
mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1]
|
115 |
+
mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f)
|
116 |
+
return mesh_output
|
117 |
+
|
118 |
+
|
119 |
+
def get_obj_from_str(string, reload=False):
|
120 |
+
module, cls = string.rsplit(".", 1)
|
121 |
+
if reload:
|
122 |
+
module_imp = importlib.import_module(module)
|
123 |
+
importlib.reload(module_imp)
|
124 |
+
return getattr(importlib.import_module(module, package=None), cls)
|
125 |
+
|
126 |
+
|
127 |
+
def instantiate_from_config(config, **kwargs):
|
128 |
+
if "target" not in config:
|
129 |
+
raise KeyError("Expected key `target` to instantiate.")
|
130 |
+
cls = get_obj_from_str(config["target"])
|
131 |
+
params = config.get("params", dict())
|
132 |
+
kwargs.update(params)
|
133 |
+
instance = cls(**kwargs)
|
134 |
+
return instance
|
135 |
+
|
136 |
+
|
137 |
+
class Hunyuan3DDiTPipeline:
    @classmethod
    def from_single_file(
        cls,
        ckpt_path,
        config_path,
        device='cpu',
        dtype=torch.float16,
        **kwargs,
    ):
        # load config
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # load ckpt
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"Model file {ckpt_path} not found")
        logger.info(f"Loading model from {ckpt_path}")

        if ckpt_path.endswith('.safetensors'):
            # parse safetensors
            import safetensors.torch
            safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
            ckpt = {}
            for key, value in safetensors_ckpt.items():
                model_name = key.split('.')[0]
                new_key = key[len(model_name) + 1:]
                if model_name not in ckpt:
                    ckpt[model_name] = {}
                ckpt[model_name][new_key] = value
        else:
            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)

        # load model
        from accelerate import init_empty_weights
        with init_empty_weights():
            model = instantiate_from_config(config['model'])
            vae = instantiate_from_config(config['vae'])
            conditioner = instantiate_from_config(config['conditioner'])
            image_processor = instantiate_from_config(config['image_processor'])
            scheduler = instantiate_from_config(config['scheduler'])

        model.load_state_dict(ckpt['model'], assign=True)
        vae.load_state_dict(ckpt['vae'], assign=True)
        if 'conditioner' in ckpt:
            conditioner.load_state_dict(ckpt['conditioner'], assign=True)

        model_kwargs = dict(
            vae=vae,
            model=model,
            scheduler=scheduler,
            conditioner=conditioner,
            image_processor=image_processor,
            device=device,
            dtype=dtype,
        )
        model_kwargs.update(kwargs)

        return cls(
            **model_kwargs
        )

    @classmethod
    def from_pretrained(
        cls,
        model_path,
        device='cuda',
        dtype=torch.float16,
        use_safetensors=None,
        variant=None,
        subfolder='hunyuan3d-dit-v2-0',
        **kwargs,
    ):
        original_model_path = model_path
        if not os.path.exists(model_path):
            # try local path
            base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
            model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
            if not os.path.exists(model_path):
                try:
                    import huggingface_hub
                    # download from huggingface
                    path = huggingface_hub.snapshot_download(repo_id=original_model_path)
                    model_path = os.path.join(path, subfolder)
                except ImportError:
                    logger.warning(
                        "You need to install HuggingFace Hub to load models from the hub."
                    )
                    raise RuntimeError(f"Model path {model_path} not found")
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model path {original_model_path} not found")

        extension = 'ckpt' if not use_safetensors else 'safetensors'
        variant = '' if variant is None else f'.{variant}'
        ckpt_name = f'model{variant}.{extension}'
        config_path = os.path.join(model_path, 'config.yaml')
        ckpt_path = os.path.join(model_path, ckpt_name)

        return cls.from_single_file(
            ckpt_path,
            config_path,
            device=device,
            dtype=dtype,
            use_safetensors=use_safetensors,
            variant=variant,
            **kwargs
        )

    def __init__(
        self,
        vae,
        model,
        scheduler,
        conditioner,
        image_processor,
        device='cuda',
        dtype=torch.float16,
        **kwargs
    ):
        self.vae = vae
        self.model = model
        self.scheduler = scheduler
        self.conditioner = conditioner
        self.image_processor = image_processor

        self.to(device, dtype)

    def to(self, device=None, dtype=None):
        if device is not None:
            self.device = torch.device(device)
            self.vae.to(device)
            self.model.to(device)
            self.conditioner.to(device)
        if dtype is not None:
            self.dtype = dtype
            self.vae.to(dtype=dtype)
            self.model.to(dtype=dtype)
            self.conditioner.to(dtype=dtype)

    def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
        bsz = image.shape[0]
        cond = self.conditioner(image=image, mask=mask)

        if do_classifier_free_guidance:
            un_cond = self.conditioner.unconditional_embedding(bsz)

            if dual_guidance:
                un_cond_drop_main = copy.deepcopy(un_cond)
                un_cond_drop_main['additional'] = cond['additional']

                def cat_recursive(a, b, c):
                    if isinstance(a, torch.Tensor):
                        return torch.cat([a, b, c], dim=0).to(self.dtype)
                    out = {}
                    for k in a.keys():
                        out[k] = cat_recursive(a[k], b[k], c[k])
                    return out

                cond = cat_recursive(cond, un_cond_drop_main, un_cond)
            else:
                un_cond = self.conditioner.unconditional_embedding(bsz)

                def cat_recursive(a, b):
                    if isinstance(a, torch.Tensor):
                        return torch.cat([a, b], dim=0).to(self.dtype)
                    out = {}
                    for k in a.keys():
                        out[k] = cat_recursive(a[k], b[k])
                    return out

                cond = cat_recursive(cond, un_cond)
        return cond

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def prepare_latents(self, batch_size, dtype, device, generator, latents=None):
        shape = (batch_size, *self.vae.latent_shape)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
        return latents

    def prepare_image(self, image):
        if isinstance(image, str) and not os.path.exists(image):
            raise FileNotFoundError(f"Couldn't find image at path {image}")

        if not isinstance(image, list):
            image = [image]
        image_pts = []
        mask_pts = []
        for img in image:
            image_pt, mask_pt = self.image_processor(img, return_mask=True)
            image_pts.append(image_pt)
            mask_pts.append(mask_pt)

        image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
        if mask_pts[0] is not None:
            mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
        else:
            mask_pts = None
        return image_pts, mask_pts

    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w (`torch.Tensor`):
                guidance scale values used to generate the embedding vectors
            embedding_dim (`int`, *optional*, defaults to 512):
                dimension of the embeddings to generate
            dtype:
                data type of the generated embeddings

        Returns:
            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    @torch.no_grad()
    def __call__(
        self,
        image: Union[str, List[str], Image.Image] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        eta: float = 0.0,
        guidance_scale: float = 7.5,
        dual_guidance_scale: float = 10.5,
        dual_guidance: bool = True,
        generator=None,
        box_v=1.01,
        octree_resolution=384,
        mc_level=-1 / 512,
        num_chunks=8000,
        mc_algo='mc',
        output_type: Optional[str] = "trimesh",
        enable_pbar=True,
        **kwargs,
    ) -> List[List[trimesh.Trimesh]]:
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        device = self.device
        dtype = self.dtype
        do_classifier_free_guidance = guidance_scale >= 0 and \
                                      getattr(self.model, 'guidance_cond_proj_dim', None) is None
        dual_guidance = dual_guidance_scale >= 0 and dual_guidance

        image, mask = self.prepare_image(image)
        cond = self.encode_cond(image=image,
                                mask=mask,
                                do_classifier_free_guidance=do_classifier_free_guidance,
                                dual_guidance=dual_guidance)
        batch_size = image.shape[0]

        t_dtype = torch.long
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas)

        latents = self.prepare_latents(batch_size, dtype, device, generator)
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        guidance_cond = None
        if getattr(self.model, 'guidance_cond_proj_dim', None) is not None:
            print('Using lcm guidance scale')
            guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size)
            guidance_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

        for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)):
            # expand the latents if we are doing classifier free guidance
            if do_classifier_free_guidance:
                latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2))
            else:
                latent_model_input = latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device)
            timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
            noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond)

            # no drop, drop clip, all drop
            if do_classifier_free_guidance:
                if dual_guidance:
                    noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3)
                    noise_pred = (
                        noise_pred_uncond
                        + guidance_scale * (noise_pred_clip - noise_pred_dino)
                        + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond)
                    )
                else:
                    noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
            latents = outputs.prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, outputs)

        return self._export(
            latents,
            output_type,
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )

    def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo):
        if not output_type == "latent":
            latents = 1. / self.vae.scale_factor * latents
            latents = self.vae(latents)
            outputs = self.vae.latents2mesh(
                latents,
                bounds=box_v,
                mc_level=mc_level,
                num_chunks=num_chunks,
                octree_resolution=octree_resolution,
                mc_algo=mc_algo,
            )
        else:
            outputs = latents

        if output_type == 'trimesh':
            outputs = export_to_trimesh(outputs)

        return outputs


class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):

    @torch.no_grad()
    def __call__(
        self,
        image: Union[str, List[str], Image.Image] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        eta: float = 0.0,
        guidance_scale: float = 7.5,
        generator=None,
        box_v=1.01,
        octree_resolution=384,
        mc_level=0.0,
        mc_algo='mc',
        num_chunks=8000,
        output_type: Optional[str] = "trimesh",
        enable_pbar=True,
        **kwargs,
    ) -> List[List[trimesh.Trimesh]]:
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        device = self.device
        dtype = self.dtype
        do_classifier_free_guidance = guidance_scale >= 0 and not (
            hasattr(self.model, 'guidance_embed') and
            self.model.guidance_embed is True
        )

        image, mask = self.prepare_image(image)
        cond = self.encode_cond(
            image=image,
            mask=mask,
            do_classifier_free_guidance=do_classifier_free_guidance,
            dual_guidance=False,
        )
        batch_size = image.shape[0]

        # 5. Prepare timesteps
        # NOTE: this is slightly different from common usage, we start from 0.
        sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            sigmas=sigmas,
        )
        latents = self.prepare_latents(batch_size, dtype, device, generator)

        guidance = None
        if hasattr(self.model, 'guidance_embed') and \
                self.model.guidance_embed is True:
            guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)

        for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
            # expand the latents if we are doing classifier free guidance
            if do_classifier_free_guidance:
                latent_model_input = torch.cat([latents] * 2)
            else:
                latent_model_input = latents

            # NOTE: we assume the model takes timesteps ranged from 0 to 1
            timestep = t.expand(latent_model_input.shape[0]).to(
                latents.dtype) / self.scheduler.config.num_train_timesteps
            noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance)

            if do_classifier_free_guidance:
                noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            outputs = self.scheduler.step(noise_pred, t, latents)
            latents = outputs.prev_sample

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, outputs)

        return self._export(
            latents,
            output_type,
            box_v, mc_level, num_chunks, octree_resolution, mc_algo,
        )
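A minimal usage sketch for the pipelines above, assuming hy3dgen.shapegen re-exports Hunyuan3DDiTFlowMatchingPipeline and using 'tencent/Hunyuan3D-2' and 'demo.png' as placeholder repo id and input path; the call mirrors the __call__ signature in this file and is illustrative only.

# Usage sketch (illustrative; repo id and image path are placeholders)
import torch
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2',        # resolved via the HY3DGEN_MODELS cache or the HF Hub
    device='cuda',
    dtype=torch.float16,
)
meshes = pipeline(image='demo.png', num_inference_steps=30, octree_resolution=256)
meshes[0].export('demo_shape.glb')  # output_type defaults to 'trimesh'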
hy3dgen/shapegen/postprocessors.py
ADDED
@@ -0,0 +1,175 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.

# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import os
import tempfile
from typing import Union

import pymeshlab
import trimesh

from .models.vae import Latent2MeshOutput


def load_mesh(path):
    if path.endswith(".glb"):
        mesh = trimesh.load(path)
    else:
        mesh = pymeshlab.MeshSet()
        mesh.load_new_mesh(path)
    return mesh


def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
    mesh.apply_filter(
        "meshing_decimation_quadric_edge_collapse",
        targetfacenum=max_facenum,
        qualitythr=1.0,
        preserveboundary=True,
        boundaryweight=3,
        preservenormal=True,
        preservetopology=True,
        autoclean=True
    )
    return mesh


def remove_floater(mesh: pymeshlab.MeshSet):
    mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
                      nbfaceratio=0.005)
    mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
    mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
    return mesh


def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
    temp_file.close()
    temp_file_name = temp_file.name

    mesh.save_current_mesh(temp_file_name)
    mesh = trimesh.load(temp_file_name)
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)

    # check the type of the loaded object
    if isinstance(mesh, trimesh.Scene):
        combined_mesh = trimesh.Trimesh()
        # if it is a Scene, iterate over all geometries and merge them
        for geom in mesh.geometry.values():
            combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
        mesh = combined_mesh
    return mesh


def trimesh2pymeshlab(mesh: trimesh.Trimesh):
    temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
    temp_file.close()
    temp_file_name = temp_file.name

    if isinstance(mesh, trimesh.scene.Scene):
        for idx, obj in enumerate(mesh.geometry.values()):
            if idx == 0:
                temp_mesh = obj
            else:
                temp_mesh = temp_mesh + obj
        mesh = temp_mesh
    mesh.export(temp_file_name)
    mesh = pymeshlab.MeshSet()
    mesh.load_new_mesh(temp_file_name)
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)

    return mesh


def export_mesh(input, output):
    if isinstance(input, pymeshlab.MeshSet):
        mesh = output
    elif isinstance(input, Latent2MeshOutput):
        # wrap the processed pymeshlab result back into a Latent2MeshOutput
        out = Latent2MeshOutput()
        out.mesh_v = output.current_mesh().vertex_matrix()
        out.mesh_f = output.current_mesh().face_matrix()
        mesh = out
    else:
        mesh = pymeshlab2trimesh(output)
    return mesh


def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
    if isinstance(mesh, str):
        mesh = load_mesh(mesh)
    elif isinstance(mesh, Latent2MeshOutput):
        # build a pymeshlab mesh from the raw vertex/face arrays before rebinding `mesh` to a MeshSet
        mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
        mesh = pymeshlab.MeshSet()
        mesh.add_mesh(mesh_pymeshlab, "converted_mesh")

    if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
        mesh = trimesh2pymeshlab(mesh)

    return mesh


class FaceReducer:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
        max_facenum: int = 40000
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
        ms = import_mesh(mesh)
        ms = reduce_face(ms, max_facenum=max_facenum)
        mesh = export_mesh(mesh, ms)
        return mesh


class FloaterRemover:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
        ms = import_mesh(mesh)
        ms = remove_floater(ms)
        mesh = export_mesh(mesh, ms)
        return mesh


class DegenerateFaceRemover:
    def __call__(
        self,
        mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
    ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
        ms = import_mesh(mesh)

        temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
        temp_file.close()
        temp_file_name = temp_file.name

        # round-trip the mesh through a temporary PLY file and reload it with pymeshlab
        ms.save_current_mesh(temp_file_name)
        ms = pymeshlab.MeshSet()
        ms.load_new_mesh(temp_file_name)
        if os.path.exists(temp_file_name):
            os.remove(temp_file_name)

        mesh = export_mesh(mesh, ms)
        return mesh
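A sketch of how these post-processors are typically chained; 'demo_shape.glb' is a placeholder for a mesh produced by the shape pipeline earlier in this diff, and the chain is illustrative rather than prescribed by the commit.

# Post-processing sketch (illustrative): clean and decimate a generated mesh
import trimesh
from hy3dgen.shapegen.postprocessors import DegenerateFaceRemover, FaceReducer, FloaterRemover

mesh = trimesh.load('demo_shape.glb')          # placeholder path
mesh = FloaterRemover()(mesh)                  # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)           # PLY round trip through pymeshlab
mesh = FaceReducer()(mesh, max_facenum=40000)  # quadric edge-collapse decimation
mesh.export('demo_shape_clean.glb')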
hy3dgen/shapegen/preprocessors.py
ADDED
@@ -0,0 +1,127 @@
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.

# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.

# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

import cv2
import numpy as np
import torch
from PIL import Image
from einops import repeat, rearrange


def array_to_tensor(np_array):
    image_pt = torch.tensor(np_array).float()
    image_pt = image_pt / 255 * 2 - 1
    image_pt = rearrange(image_pt, "h w c -> c h w")
    image_pts = repeat(image_pt, "c h w -> b c h w", b=1)
    return image_pts


class ImageProcessorV2:
    def __init__(self, size=512, border_ratio=None):
        self.size = size
        self.border_ratio = border_ratio

    @staticmethod
    def recenter(image, border_ratio: float = 0.2):
        """ recenter an image to leave some empty space at the image border.

        Args:
            image (ndarray): input image, float/uint8 [H, W, 3/4]; the alpha
                channel (if present) is used as the foreground mask
            border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2.

        Returns:
            ndarray: output image, float/uint8 [H, W, 3/4]
        """

        if image.shape[-1] == 4:
            mask = image[..., 3]
        else:
            mask = np.ones_like(image[..., 0:1]) * 255
            image = np.concatenate([image, mask], axis=-1)
            mask = mask[..., 0]

        H, W, C = image.shape

        size = max(H, W)
        result = np.zeros((size, size, C), dtype=np.uint8)

        coords = np.nonzero(mask)
        x_min, x_max = coords[0].min(), coords[0].max()
        y_min, y_max = coords[1].min(), coords[1].max()
        h = x_max - x_min
        w = y_max - y_min
        if h == 0 or w == 0:
            raise ValueError('input image is empty')
        desired_size = int(size * (1 - border_ratio))
        scale = desired_size / max(h, w)
        h2 = int(h * scale)
        w2 = int(w * scale)
        x2_min = (size - h2) // 2
        x2_max = x2_min + h2

        y2_min = (size - w2) // 2
        y2_max = y2_min + w2

        result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2),
                                                          interpolation=cv2.INTER_AREA)

        bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
        # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
        mask = result[..., 3:].astype(np.float32) / 255
        result = result[..., :3] * mask + bg * (1 - mask)

        mask = mask * 255
        result = result.clip(0, 255).astype(np.uint8)
        mask = mask.clip(0, 255).astype(np.uint8)
        return result, mask

    def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
        if self.border_ratio is not None:
            border_ratio = self.border_ratio
            print(f"Using border_ratio from init: {border_ratio}")
        if isinstance(image, str):
            image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
            image, mask = self.recenter(image, border_ratio=border_ratio)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        elif isinstance(image, Image.Image):
            image = np.asarray(image)
            image, mask = self.recenter(image, border_ratio=border_ratio)

        image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
        mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST)
        mask = mask[..., np.newaxis]

        if to_tensor:
            image = array_to_tensor(image)
            mask = array_to_tensor(mask)
        if return_mask:
            return image, mask
        return image


IMAGE_PROCESSORS = {
    "v2": ImageProcessorV2,
}

DEFAULT_IMAGEPROCESSOR = 'v2'
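A standalone sketch of the preprocessor added above; 'demo.png' is a placeholder path to an RGBA image whose alpha channel marks the foreground, and the shapes in the comment follow from the default size of 512.

# Preprocessing sketch (illustrative): recenter, pad and normalize an input image
from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)
image_pt, mask_pt = processor('demo.png', border_ratio=0.15, return_mask=True)
# image_pt: (1, 3, 512, 512) tensor scaled to [-1, 1]; mask_pt: (1, 1, 512, 512)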