syedMohib44 commited on
Commit
00e5927
·
1 Parent(s): f4ae690
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -35
  2. .gitignore +28 -0
  3. README.md +15 -5
  4. assets/shoes.png +0 -0
  5. build/lib/hy3dgen/__init__.py +23 -0
  6. build/lib/hy3dgen/rembg.py +36 -0
  7. build/lib/hy3dgen/shapegen/__init__.py +27 -0
  8. build/lib/hy3dgen/shapegen/models/__init__.py +28 -0
  9. build/lib/hy3dgen/shapegen/models/conditioner.py +165 -0
  10. build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py +390 -0
  11. build/lib/hy3dgen/shapegen/models/vae.py +636 -0
  12. build/lib/hy3dgen/shapegen/pipelines.py +589 -0
  13. build/lib/hy3dgen/shapegen/postprocessors.py +175 -0
  14. build/lib/hy3dgen/shapegen/preprocessors.py +127 -0
  15. build/lib/hy3dgen/shapegen/schedulers.py +307 -0
  16. build/lib/hy3dgen/texgen/__init__.py +26 -0
  17. build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py +23 -0
  18. build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py +116 -0
  19. build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py +70 -0
  20. build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py +833 -0
  21. build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py +44 -0
  22. build/lib/hy3dgen/texgen/differentiable_renderer/setup.py +48 -0
  23. build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py +23 -0
  24. build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py +554 -0
  25. build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py +23 -0
  26. build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py +440 -0
  27. build/lib/hy3dgen/texgen/pipelines.py +227 -0
  28. build/lib/hy3dgen/texgen/utils/__init__.py +23 -0
  29. build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py +132 -0
  30. build/lib/hy3dgen/texgen/utils/counter_utils.py +58 -0
  31. build/lib/hy3dgen/texgen/utils/dehighlight_utils.py +84 -0
  32. build/lib/hy3dgen/texgen/utils/multiview_utils.py +86 -0
  33. build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py +46 -0
  34. build/lib/hy3dgen/texgen/utils/uv_warp_utils.py +42 -0
  35. build/lib/hy3dgen/text2image.py +93 -0
  36. dist/hy3dgen-2.0.0-py3.12.egg +0 -0
  37. hy3dgen.egg-info/PKG-INFO +3 -0
  38. hy3dgen.egg-info/SOURCES.txt +37 -0
  39. hy3dgen.egg-info/dependency_links.txt +1 -0
  40. hy3dgen.egg-info/top_level.txt +1 -0
  41. hy3dgen/__init__.py +23 -0
  42. hy3dgen/rembg.py +36 -0
  43. hy3dgen/shapegen/__init__.py +27 -0
  44. hy3dgen/shapegen/models/__init__.py +28 -0
  45. hy3dgen/shapegen/models/conditioner.py +165 -0
  46. hy3dgen/shapegen/models/hunyuan3ddit.py +390 -0
  47. hy3dgen/shapegen/models/vae.py +636 -0
  48. hy3dgen/shapegen/pipelines.py +589 -0
  49. hy3dgen/shapegen/postprocessors.py +175 -0
  50. hy3dgen/shapegen/preprocessors.py +127 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.obj filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+
6
+ # Virtual environment
7
+ venv/
8
+ env/
9
+ .venv/
10
+
11
+ # Jupyter Notebook checkpoints
12
+ .ipynb_checkpoints/
13
+
14
+ # Logs and local environment files
15
+ *.log
16
+ *.env
17
+ .env.local
18
+
19
+ # PyTorch or TensorFlow saved models
20
+ *.pt
21
+ *.pth
22
+ *.h5
23
+
24
+ # VSCode settings (if using VSCode)
25
+ .vscode/
26
+
27
+ # Hugging Face cache (optional)
28
+ ~/.cache/huggingface/
README.md CHANGED
@@ -1,14 +1,24 @@
1
  ---
2
- title: Ditto Api
3
- emoji: 🌖
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.17.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Api to generate 3D object out of an image
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Ditto
3
+ emoji: 🐢
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.17.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Image to 3D object generator
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+ # Setup
17
+
18
+ ```
19
+ uv pip compile requirements.txt -o requirements-uv.txt --index-strategy unsafe-best-match --no-build-isolation -p 3.10
20
+
21
+ pip install -r requirements.txt
22
+
23
+ python setup.py install
24
+ ```
assets/shoes.png ADDED
build/lib/hy3dgen/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/rembg.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ from PIL import Image
27
+ from rembg import remove, new_session
28
+
29
+
30
+ class BackgroundRemover():
31
+ def __init__(self):
32
+ self.session = new_session()
33
+
34
+ def __call__(self, image: Image.Image):
35
+ output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
36
+ return output
build/lib/hy3dgen/shapegen/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
26
+ from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
27
+ from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
build/lib/hy3dgen/shapegen/models/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
27
+ from .hunyuan3ddit import Hunyuan3DDiT
28
+ from .vae import ShapeVAE
build/lib/hy3dgen/shapegen/models/conditioner.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ from torchvision import transforms
28
+ from transformers import (
29
+ CLIPVisionModelWithProjection,
30
+ CLIPVisionConfig,
31
+ Dinov2Model,
32
+ Dinov2Config,
33
+ )
34
+
35
+
36
+ class ImageEncoder(nn.Module):
37
+ def __init__(
38
+ self,
39
+ version=None,
40
+ config=None,
41
+ use_cls_token=True,
42
+ image_size=224,
43
+ **kwargs,
44
+ ):
45
+ super().__init__()
46
+
47
+ if config is None:
48
+ self.model = self.MODEL_CLASS.from_pretrained(version)
49
+ else:
50
+ self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
51
+ self.model.eval()
52
+ self.model.requires_grad_(False)
53
+ self.use_cls_token = use_cls_token
54
+ self.size = image_size // 14
55
+ self.num_patches = (image_size // 14) ** 2
56
+ if self.use_cls_token:
57
+ self.num_patches += 1
58
+
59
+ self.transform = transforms.Compose(
60
+ [
61
+ transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
62
+ transforms.CenterCrop(image_size),
63
+ transforms.Normalize(
64
+ mean=self.mean,
65
+ std=self.std,
66
+ ),
67
+ ]
68
+ )
69
+
70
+ def forward(self, image, mask=None, value_range=(-1, 1)):
71
+ if value_range is not None:
72
+ low, high = value_range
73
+ image = (image - low) / (high - low)
74
+
75
+ image = image.to(self.model.device, dtype=self.model.dtype)
76
+ inputs = self.transform(image)
77
+ outputs = self.model(inputs)
78
+
79
+ last_hidden_state = outputs.last_hidden_state
80
+ if not self.use_cls_token:
81
+ last_hidden_state = last_hidden_state[:, 1:, :]
82
+
83
+ return last_hidden_state
84
+
85
+ def unconditional_embedding(self, batch_size):
86
+ device = next(self.model.parameters()).device
87
+ dtype = next(self.model.parameters()).dtype
88
+ zero = torch.zeros(
89
+ batch_size,
90
+ self.num_patches,
91
+ self.model.config.hidden_size,
92
+ device=device,
93
+ dtype=dtype,
94
+ )
95
+
96
+ return zero
97
+
98
+
99
+ class CLIPImageEncoder(ImageEncoder):
100
+ MODEL_CLASS = CLIPVisionModelWithProjection
101
+ MODEL_CONFIG_CLASS = CLIPVisionConfig
102
+ mean = [0.48145466, 0.4578275, 0.40821073]
103
+ std = [0.26862954, 0.26130258, 0.27577711]
104
+
105
+
106
+ class DinoImageEncoder(ImageEncoder):
107
+ MODEL_CLASS = Dinov2Model
108
+ MODEL_CONFIG_CLASS = Dinov2Config
109
+ mean = [0.485, 0.456, 0.406]
110
+ std = [0.229, 0.224, 0.225]
111
+
112
+
113
+ def build_image_encoder(config):
114
+ if config['type'] == 'CLIPImageEncoder':
115
+ return CLIPImageEncoder(**config['kwargs'])
116
+ elif config['type'] == 'DinoImageEncoder':
117
+ return DinoImageEncoder(**config['kwargs'])
118
+ else:
119
+ raise ValueError(f'Unknown image encoder type: {config["type"]}')
120
+
121
+
122
+ class DualImageEncoder(nn.Module):
123
+ def __init__(
124
+ self,
125
+ main_image_encoder,
126
+ additional_image_encoder,
127
+ ):
128
+ super().__init__()
129
+ self.main_image_encoder = build_image_encoder(main_image_encoder)
130
+ self.additional_image_encoder = build_image_encoder(additional_image_encoder)
131
+
132
+ def forward(self, image, mask=None):
133
+ outputs = {
134
+ 'main': self.main_image_encoder(image, mask=mask),
135
+ 'additional': self.additional_image_encoder(image, mask=mask),
136
+ }
137
+ return outputs
138
+
139
+ def unconditional_embedding(self, batch_size):
140
+ outputs = {
141
+ 'main': self.main_image_encoder.unconditional_embedding(batch_size),
142
+ 'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
143
+ }
144
+ return outputs
145
+
146
+
147
+ class SingleImageEncoder(nn.Module):
148
+ def __init__(
149
+ self,
150
+ main_image_encoder,
151
+ ):
152
+ super().__init__()
153
+ self.main_image_encoder = build_image_encoder(main_image_encoder)
154
+
155
+ def forward(self, image, mask=None):
156
+ outputs = {
157
+ 'main': self.main_image_encoder(image, mask=mask),
158
+ }
159
+ return outputs
160
+
161
+ def unconditional_embedding(self, batch_size):
162
+ outputs = {
163
+ 'main': self.main_image_encoder.unconditional_embedding(batch_size),
164
+ }
165
+ return outputs
build/lib/hy3dgen/shapegen/models/hunyuan3ddit.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import math
26
+ from dataclasses import dataclass
27
+ from typing import List, Tuple, Optional
28
+
29
+ import torch
30
+ from einops import rearrange
31
+ from torch import Tensor, nn
32
+
33
+
34
+ def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
35
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
36
+ x = rearrange(x, "B H L D -> B L (H D)")
37
+ return x
38
+
39
+
40
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
41
+ """
42
+ Create sinusoidal timestep embeddings.
43
+ :param t: a 1-D Tensor of N indices, one per batch element.
44
+ These may be fractional.
45
+ :param dim: the dimension of the output.
46
+ :param max_period: controls the minimum frequency of the embeddings.
47
+ :return: an (N, D) Tensor of positional embeddings.
48
+ """
49
+ t = time_factor * t
50
+ half = dim // 2
51
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
52
+ t.device
53
+ )
54
+
55
+ args = t[:, None].float() * freqs[None]
56
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
57
+ if dim % 2:
58
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
59
+ if torch.is_floating_point(t):
60
+ embedding = embedding.to(t)
61
+ return embedding
62
+
63
+
64
+ class MLPEmbedder(nn.Module):
65
+ def __init__(self, in_dim: int, hidden_dim: int):
66
+ super().__init__()
67
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
68
+ self.silu = nn.SiLU()
69
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
70
+
71
+ def forward(self, x: Tensor) -> Tensor:
72
+ return self.out_layer(self.silu(self.in_layer(x)))
73
+
74
+
75
+ class RMSNorm(torch.nn.Module):
76
+ def __init__(self, dim: int):
77
+ super().__init__()
78
+ self.scale = nn.Parameter(torch.ones(dim))
79
+
80
+ def forward(self, x: Tensor):
81
+ x_dtype = x.dtype
82
+ x = x.float()
83
+ rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6)
84
+ return (x * rrms).to(dtype=x_dtype) * self.scale
85
+
86
+
87
+ class QKNorm(torch.nn.Module):
88
+ def __init__(self, dim: int):
89
+ super().__init__()
90
+ self.query_norm = RMSNorm(dim)
91
+ self.key_norm = RMSNorm(dim)
92
+
93
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]:
94
+ q = self.query_norm(q)
95
+ k = self.key_norm(k)
96
+ return q.to(v), k.to(v)
97
+
98
+
99
+ class SelfAttention(nn.Module):
100
+ def __init__(
101
+ self,
102
+ dim: int,
103
+ num_heads: int = 8,
104
+ qkv_bias: bool = False,
105
+ ):
106
+ super().__init__()
107
+ self.num_heads = num_heads
108
+ head_dim = dim // num_heads
109
+
110
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
111
+ self.norm = QKNorm(head_dim)
112
+ self.proj = nn.Linear(dim, dim)
113
+
114
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
115
+ qkv = self.qkv(x)
116
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
117
+ q, k = self.norm(q, k, v)
118
+ x = attention(q, k, v, pe=pe)
119
+ x = self.proj(x)
120
+ return x
121
+
122
+
123
+ @dataclass
124
+ class ModulationOut:
125
+ shift: Tensor
126
+ scale: Tensor
127
+ gate: Tensor
128
+
129
+
130
+ class Modulation(nn.Module):
131
+ def __init__(self, dim: int, double: bool):
132
+ super().__init__()
133
+ self.is_double = double
134
+ self.multiplier = 6 if double else 3
135
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
136
+
137
+ def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
138
+ out = self.lin(nn.functional.silu(vec))[:, None, :]
139
+ out = out.chunk(self.multiplier, dim=-1)
140
+
141
+ return (
142
+ ModulationOut(*out[:3]),
143
+ ModulationOut(*out[3:]) if self.is_double else None,
144
+ )
145
+
146
+
147
+ class DoubleStreamBlock(nn.Module):
148
+ def __init__(
149
+ self,
150
+ hidden_size: int,
151
+ num_heads: int,
152
+ mlp_ratio: float,
153
+ qkv_bias: bool = False,
154
+ ):
155
+ super().__init__()
156
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
157
+ self.num_heads = num_heads
158
+ self.hidden_size = hidden_size
159
+ self.img_mod = Modulation(hidden_size, double=True)
160
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
161
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
162
+
163
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
164
+ self.img_mlp = nn.Sequential(
165
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
166
+ nn.GELU(approximate="tanh"),
167
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
168
+ )
169
+
170
+ self.txt_mod = Modulation(hidden_size, double=True)
171
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
172
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
173
+
174
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
175
+ self.txt_mlp = nn.Sequential(
176
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
177
+ nn.GELU(approximate="tanh"),
178
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
179
+ )
180
+
181
+ def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]:
182
+ img_mod1, img_mod2 = self.img_mod(vec)
183
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
184
+
185
+ img_modulated = self.img_norm1(img)
186
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
187
+ img_qkv = self.img_attn.qkv(img_modulated)
188
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
189
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
190
+
191
+ txt_modulated = self.txt_norm1(txt)
192
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
193
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
194
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
195
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
196
+
197
+ q = torch.cat((txt_q, img_q), dim=2)
198
+ k = torch.cat((txt_k, img_k), dim=2)
199
+ v = torch.cat((txt_v, img_v), dim=2)
200
+
201
+ attn = attention(q, k, v, pe=pe)
202
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
203
+
204
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
205
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
206
+
207
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
208
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
209
+ return img, txt
210
+
211
+
212
+ class SingleStreamBlock(nn.Module):
213
+ """
214
+ A DiT block with parallel linear layers as described in
215
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
216
+ """
217
+
218
+ def __init__(
219
+ self,
220
+ hidden_size: int,
221
+ num_heads: int,
222
+ mlp_ratio: float = 4.0,
223
+ qk_scale: Optional[float] = None,
224
+ ):
225
+ super().__init__()
226
+
227
+ self.hidden_dim = hidden_size
228
+ self.num_heads = num_heads
229
+ head_dim = hidden_size // num_heads
230
+ self.scale = qk_scale or head_dim ** -0.5
231
+
232
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
233
+ # qkv and mlp_in
234
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
235
+ # proj and mlp_out
236
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
237
+
238
+ self.norm = QKNorm(head_dim)
239
+
240
+ self.hidden_size = hidden_size
241
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
242
+
243
+ self.mlp_act = nn.GELU(approximate="tanh")
244
+ self.modulation = Modulation(hidden_size, double=False)
245
+
246
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
247
+ mod, _ = self.modulation(vec)
248
+
249
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
250
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
251
+
252
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
253
+ q, k = self.norm(q, k, v)
254
+
255
+ # compute attention
256
+ attn = attention(q, k, v, pe=pe)
257
+ # compute activation in mlp stream, cat again and run second linear layer
258
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
259
+ return x + mod.gate * output
260
+
261
+
262
+ class LastLayer(nn.Module):
263
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
264
+ super().__init__()
265
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
266
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
267
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
268
+
269
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
270
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
271
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
272
+ x = self.linear(x)
273
+ return x
274
+
275
+
276
+ class Hunyuan3DDiT(nn.Module):
277
+ def __init__(
278
+ self,
279
+ in_channels: int = 64,
280
+ context_in_dim: int = 1536,
281
+ hidden_size: int = 1024,
282
+ mlp_ratio: float = 4.0,
283
+ num_heads: int = 16,
284
+ depth: int = 16,
285
+ depth_single_blocks: int = 32,
286
+ axes_dim: List[int] = [64],
287
+ theta: int = 10_000,
288
+ qkv_bias: bool = True,
289
+ time_factor: float = 1000,
290
+ ckpt_path: Optional[str] = None,
291
+ **kwargs,
292
+ ):
293
+ super().__init__()
294
+ self.in_channels = in_channels
295
+ self.context_in_dim = context_in_dim
296
+ self.hidden_size = hidden_size
297
+ self.mlp_ratio = mlp_ratio
298
+ self.num_heads = num_heads
299
+ self.depth = depth
300
+ self.depth_single_blocks = depth_single_blocks
301
+ self.axes_dim = axes_dim
302
+ self.theta = theta
303
+ self.qkv_bias = qkv_bias
304
+ self.time_factor = time_factor
305
+ self.out_channels = self.in_channels
306
+
307
+ if hidden_size % num_heads != 0:
308
+ raise ValueError(
309
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
310
+ )
311
+ pe_dim = hidden_size // num_heads
312
+ if sum(axes_dim) != pe_dim:
313
+ raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
314
+ self.hidden_size = hidden_size
315
+ self.num_heads = num_heads
316
+ self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
317
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
318
+ self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
319
+
320
+ self.double_blocks = nn.ModuleList(
321
+ [
322
+ DoubleStreamBlock(
323
+ self.hidden_size,
324
+ self.num_heads,
325
+ mlp_ratio=mlp_ratio,
326
+ qkv_bias=qkv_bias,
327
+ )
328
+ for _ in range(depth)
329
+ ]
330
+ )
331
+
332
+ self.single_blocks = nn.ModuleList(
333
+ [
334
+ SingleStreamBlock(
335
+ self.hidden_size,
336
+ self.num_heads,
337
+ mlp_ratio=mlp_ratio,
338
+ )
339
+ for _ in range(depth_single_blocks)
340
+ ]
341
+ )
342
+
343
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
344
+
345
+ if ckpt_path is not None:
346
+ print('restored denoiser ckpt', ckpt_path)
347
+
348
+ ckpt = torch.load(ckpt_path, map_location="cpu")
349
+ if 'state_dict' not in ckpt:
350
+ # deepspeed ckpt
351
+ state_dict = {}
352
+ for k in ckpt.keys():
353
+ new_k = k.replace('_forward_module.', '')
354
+ state_dict[new_k] = ckpt[k]
355
+ else:
356
+ state_dict = ckpt["state_dict"]
357
+
358
+ final_state_dict = {}
359
+ for k, v in state_dict.items():
360
+ if k.startswith('model.'):
361
+ final_state_dict[k.replace('model.', '')] = v
362
+ else:
363
+ final_state_dict[k] = v
364
+ missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
365
+ print('unexpected keys:', unexpected)
366
+ print('missing keys:', missing)
367
+
368
+ def forward(
369
+ self,
370
+ x,
371
+ t,
372
+ contexts,
373
+ **kwargs,
374
+ ) -> Tensor:
375
+ cond = contexts['main']
376
+ latent = self.latent_in(x)
377
+ vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
378
+ cond = self.cond_in(cond)
379
+ pe = None
380
+
381
+ for block in self.double_blocks:
382
+ latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
383
+
384
+ latent = torch.cat((cond, latent), 1)
385
+ for block in self.single_blocks:
386
+ latent = block(latent, vec=vec, pe=pe)
387
+
388
+ latent = latent[:, cond.shape[1]:, ...]
389
+ latent = self.final_layer(latent, vec)
390
+ return latent
build/lib/hy3dgen/shapegen/models/vae.py ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ from typing import Tuple, List, Union, Optional
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ from einops import rearrange, repeat
32
+ from skimage import measure
33
+ from tqdm import tqdm
34
+
35
+
36
+ class FourierEmbedder(nn.Module):
37
+ """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
38
+ each feature dimension of `x[..., i]` into:
39
+ [
40
+ sin(x[..., i]),
41
+ sin(f_1*x[..., i]),
42
+ sin(f_2*x[..., i]),
43
+ ...
44
+ sin(f_N * x[..., i]),
45
+ cos(x[..., i]),
46
+ cos(f_1*x[..., i]),
47
+ cos(f_2*x[..., i]),
48
+ ...
49
+ cos(f_N * x[..., i]),
50
+ x[..., i] # only present if include_input is True.
51
+ ], here f_i is the frequency.
52
+
53
+ Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
54
+ If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
55
+ Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
56
+
57
+ Args:
58
+ num_freqs (int): the number of frequencies, default is 6;
59
+ logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
60
+ otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
61
+ input_dim (int): the input dimension, default is 3;
62
+ include_input (bool): include the input tensor or not, default is True.
63
+
64
+ Attributes:
65
+ frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
66
+ otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
67
+
68
+ out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
69
+ otherwise, it is input_dim * num_freqs * 2.
70
+
71
+ """
72
+
73
+ def __init__(self,
74
+ num_freqs: int = 6,
75
+ logspace: bool = True,
76
+ input_dim: int = 3,
77
+ include_input: bool = True,
78
+ include_pi: bool = True) -> None:
79
+
80
+ """The initialization"""
81
+
82
+ super().__init__()
83
+
84
+ if logspace:
85
+ frequencies = 2.0 ** torch.arange(
86
+ num_freqs,
87
+ dtype=torch.float32
88
+ )
89
+ else:
90
+ frequencies = torch.linspace(
91
+ 1.0,
92
+ 2.0 ** (num_freqs - 1),
93
+ num_freqs,
94
+ dtype=torch.float32
95
+ )
96
+
97
+ if include_pi:
98
+ frequencies *= torch.pi
99
+
100
+ self.register_buffer("frequencies", frequencies, persistent=False)
101
+ self.include_input = include_input
102
+ self.num_freqs = num_freqs
103
+
104
+ self.out_dim = self.get_dims(input_dim)
105
+
106
+ def get_dims(self, input_dim):
107
+ temp = 1 if self.include_input or self.num_freqs == 0 else 0
108
+ out_dim = input_dim * (self.num_freqs * 2 + temp)
109
+
110
+ return out_dim
111
+
112
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
113
+ """ Forward process.
114
+
115
+ Args:
116
+ x: tensor of shape [..., dim]
117
+
118
+ Returns:
119
+ embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
120
+ where temp is 1 if include_input is True and 0 otherwise.
121
+ """
122
+
123
+ if self.num_freqs > 0:
124
+ embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
125
+ if self.include_input:
126
+ return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
127
+ else:
128
+ return torch.cat((embed.sin(), embed.cos()), dim=-1)
129
+ else:
130
+ return x
131
+
132
+
133
+ class DropPath(nn.Module):
134
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
135
+ """
136
+
137
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
138
+ super(DropPath, self).__init__()
139
+ self.drop_prob = drop_prob
140
+ self.scale_by_keep = scale_by_keep
141
+
142
+ def forward(self, x):
143
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
144
+
145
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
146
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
147
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
148
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
149
+ 'survival rate' as the argument.
150
+
151
+ """
152
+ if self.drop_prob == 0. or not self.training:
153
+ return x
154
+ keep_prob = 1 - self.drop_prob
155
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
156
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
157
+ if keep_prob > 0.0 and self.scale_by_keep:
158
+ random_tensor.div_(keep_prob)
159
+ return x * random_tensor
160
+
161
+ def extra_repr(self):
162
+ return f'drop_prob={round(self.drop_prob, 3):0.3f}'
163
+
164
+
165
+ class MLP(nn.Module):
166
+ def __init__(
167
+ self, *,
168
+ width: int,
169
+ output_width: int = None,
170
+ drop_path_rate: float = 0.0
171
+ ):
172
+ super().__init__()
173
+ self.width = width
174
+ self.c_fc = nn.Linear(width, width * 4)
175
+ self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
176
+ self.gelu = nn.GELU()
177
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
178
+
179
+ def forward(self, x):
180
+ return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
181
+
182
+
183
+ class QKVMultiheadCrossAttention(nn.Module):
184
+ def __init__(
185
+ self,
186
+ *,
187
+ heads: int,
188
+ n_data: Optional[int] = None,
189
+ width=None,
190
+ qk_norm=False,
191
+ norm_layer=nn.LayerNorm
192
+ ):
193
+ super().__init__()
194
+ self.heads = heads
195
+ self.n_data = n_data
196
+ self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
197
+ self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
198
+
199
+ def forward(self, q, kv):
200
+ _, n_ctx, _ = q.shape
201
+ bs, n_data, width = kv.shape
202
+ attn_ch = width // self.heads // 2
203
+ q = q.view(bs, n_ctx, self.heads, -1)
204
+ kv = kv.view(bs, n_data, self.heads, -1)
205
+ k, v = torch.split(kv, attn_ch, dim=-1)
206
+
207
+ q = self.q_norm(q)
208
+ k = self.k_norm(k)
209
+
210
+ q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
211
+ out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
212
+
213
+ return out
214
+
215
+
216
+ class MultiheadCrossAttention(nn.Module):
217
+ def __init__(
218
+ self,
219
+ *,
220
+ width: int,
221
+ heads: int,
222
+ qkv_bias: bool = True,
223
+ n_data: Optional[int] = None,
224
+ data_width: Optional[int] = None,
225
+ norm_layer=nn.LayerNorm,
226
+ qk_norm: bool = False
227
+ ):
228
+ super().__init__()
229
+ self.n_data = n_data
230
+ self.width = width
231
+ self.heads = heads
232
+ self.data_width = width if data_width is None else data_width
233
+ self.c_q = nn.Linear(width, width, bias=qkv_bias)
234
+ self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
235
+ self.c_proj = nn.Linear(width, width)
236
+ self.attention = QKVMultiheadCrossAttention(
237
+ heads=heads,
238
+ n_data=n_data,
239
+ width=width,
240
+ norm_layer=norm_layer,
241
+ qk_norm=qk_norm
242
+ )
243
+
244
+ def forward(self, x, data):
245
+ x = self.c_q(x)
246
+ data = self.c_kv(data)
247
+ x = self.attention(x, data)
248
+ x = self.c_proj(x)
249
+ return x
250
+
251
+
252
+ class ResidualCrossAttentionBlock(nn.Module):
253
+ def __init__(
254
+ self,
255
+ *,
256
+ n_data: Optional[int] = None,
257
+ width: int,
258
+ heads: int,
259
+ data_width: Optional[int] = None,
260
+ qkv_bias: bool = True,
261
+ norm_layer=nn.LayerNorm,
262
+ qk_norm: bool = False
263
+ ):
264
+ super().__init__()
265
+
266
+ if data_width is None:
267
+ data_width = width
268
+
269
+ self.attn = MultiheadCrossAttention(
270
+ n_data=n_data,
271
+ width=width,
272
+ heads=heads,
273
+ data_width=data_width,
274
+ qkv_bias=qkv_bias,
275
+ norm_layer=norm_layer,
276
+ qk_norm=qk_norm
277
+ )
278
+ self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
279
+ self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
280
+ self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
281
+ self.mlp = MLP(width=width)
282
+
283
+ def forward(self, x: torch.Tensor, data: torch.Tensor):
284
+ x = x + self.attn(self.ln_1(x), self.ln_2(data))
285
+ x = x + self.mlp(self.ln_3(x))
286
+ return x
287
+
288
+
289
+ class QKVMultiheadAttention(nn.Module):
290
+ def __init__(
291
+ self,
292
+ *,
293
+ heads: int,
294
+ n_ctx: int,
295
+ width=None,
296
+ qk_norm=False,
297
+ norm_layer=nn.LayerNorm
298
+ ):
299
+ super().__init__()
300
+ self.heads = heads
301
+ self.n_ctx = n_ctx
302
+ self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
303
+ self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
304
+
305
+ def forward(self, qkv):
306
+ bs, n_ctx, width = qkv.shape
307
+ attn_ch = width // self.heads // 3
308
+ qkv = qkv.view(bs, n_ctx, self.heads, -1)
309
+ q, k, v = torch.split(qkv, attn_ch, dim=-1)
310
+
311
+ q = self.q_norm(q)
312
+ k = self.k_norm(k)
313
+
314
+ q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
315
+ out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
316
+ return out
317
+
318
+
319
+ class MultiheadAttention(nn.Module):
320
+ def __init__(
321
+ self,
322
+ *,
323
+ n_ctx: int,
324
+ width: int,
325
+ heads: int,
326
+ qkv_bias: bool,
327
+ norm_layer=nn.LayerNorm,
328
+ qk_norm: bool = False,
329
+ drop_path_rate: float = 0.0
330
+ ):
331
+ super().__init__()
332
+ self.n_ctx = n_ctx
333
+ self.width = width
334
+ self.heads = heads
335
+ self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
336
+ self.c_proj = nn.Linear(width, width)
337
+ self.attention = QKVMultiheadAttention(
338
+ heads=heads,
339
+ n_ctx=n_ctx,
340
+ width=width,
341
+ norm_layer=norm_layer,
342
+ qk_norm=qk_norm
343
+ )
344
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
345
+
346
+ def forward(self, x):
347
+ x = self.c_qkv(x)
348
+ x = self.attention(x)
349
+ x = self.drop_path(self.c_proj(x))
350
+ return x
351
+
352
+
353
+ class ResidualAttentionBlock(nn.Module):
354
+ def __init__(
355
+ self,
356
+ *,
357
+ n_ctx: int,
358
+ width: int,
359
+ heads: int,
360
+ qkv_bias: bool = True,
361
+ norm_layer=nn.LayerNorm,
362
+ qk_norm: bool = False,
363
+ drop_path_rate: float = 0.0,
364
+ ):
365
+ super().__init__()
366
+ self.attn = MultiheadAttention(
367
+ n_ctx=n_ctx,
368
+ width=width,
369
+ heads=heads,
370
+ qkv_bias=qkv_bias,
371
+ norm_layer=norm_layer,
372
+ qk_norm=qk_norm,
373
+ drop_path_rate=drop_path_rate
374
+ )
375
+ self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
376
+ self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
377
+ self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
378
+
379
+ def forward(self, x: torch.Tensor):
380
+ x = x + self.attn(self.ln_1(x))
381
+ x = x + self.mlp(self.ln_2(x))
382
+ return x
383
+
384
+
385
+ class Transformer(nn.Module):
386
+ def __init__(
387
+ self,
388
+ *,
389
+ n_ctx: int,
390
+ width: int,
391
+ layers: int,
392
+ heads: int,
393
+ qkv_bias: bool = True,
394
+ norm_layer=nn.LayerNorm,
395
+ qk_norm: bool = False,
396
+ drop_path_rate: float = 0.0
397
+ ):
398
+ super().__init__()
399
+ self.n_ctx = n_ctx
400
+ self.width = width
401
+ self.layers = layers
402
+ self.resblocks = nn.ModuleList(
403
+ [
404
+ ResidualAttentionBlock(
405
+ n_ctx=n_ctx,
406
+ width=width,
407
+ heads=heads,
408
+ qkv_bias=qkv_bias,
409
+ norm_layer=norm_layer,
410
+ qk_norm=qk_norm,
411
+ drop_path_rate=drop_path_rate
412
+ )
413
+ for _ in range(layers)
414
+ ]
415
+ )
416
+
417
+ def forward(self, x: torch.Tensor):
418
+ for block in self.resblocks:
419
+ x = block(x)
420
+ return x
421
+
422
+
423
+ class CrossAttentionDecoder(nn.Module):
424
+
425
+ def __init__(
426
+ self,
427
+ *,
428
+ num_latents: int,
429
+ out_channels: int,
430
+ fourier_embedder: FourierEmbedder,
431
+ width: int,
432
+ heads: int,
433
+ qkv_bias: bool = True,
434
+ qk_norm: bool = False,
435
+ label_type: str = "binary"
436
+ ):
437
+ super().__init__()
438
+
439
+ self.fourier_embedder = fourier_embedder
440
+
441
+ self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)
442
+
443
+ self.cross_attn_decoder = ResidualCrossAttentionBlock(
444
+ n_data=num_latents,
445
+ width=width,
446
+ heads=heads,
447
+ qkv_bias=qkv_bias,
448
+ qk_norm=qk_norm
449
+ )
450
+
451
+ self.ln_post = nn.LayerNorm(width)
452
+ self.output_proj = nn.Linear(width, out_channels)
453
+ self.label_type = label_type
454
+
455
+ def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
456
+ queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
457
+ x = self.cross_attn_decoder(queries, latents)
458
+ x = self.ln_post(x)
459
+ occ = self.output_proj(x)
460
+ return occ
461
+
462
+
463
+ def generate_dense_grid_points(bbox_min: np.ndarray,
464
+ bbox_max: np.ndarray,
465
+ octree_depth: int,
466
+ indexing: str = "ij",
467
+ octree_resolution: int = None,
468
+ ):
469
+ length = bbox_max - bbox_min
470
+ num_cells = np.exp2(octree_depth)
471
+ if octree_resolution is not None:
472
+ num_cells = octree_resolution
473
+
474
+ x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
475
+ y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
476
+ z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
477
+ [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
478
+ xyz = np.stack((xs, ys, zs), axis=-1)
479
+ xyz = xyz.reshape(-1, 3)
480
+ grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
481
+
482
+ return xyz, grid_size, length
483
+
484
+
485
+ def center_vertices(vertices):
486
+ """Translate the vertices so that bounding box is centered at zero."""
487
+ vert_min = vertices.min(dim=0)[0]
488
+ vert_max = vertices.max(dim=0)[0]
489
+ vert_center = 0.5 * (vert_min + vert_max)
490
+ return vertices - vert_center
491
+
492
+
493
+ class Latent2MeshOutput:
494
+
495
+ def __init__(self, mesh_v=None, mesh_f=None):
496
+ self.mesh_v = mesh_v
497
+ self.mesh_f = mesh_f
498
+
499
+
500
+ class ShapeVAE(nn.Module):
501
+ def __init__(
502
+ self,
503
+ *,
504
+ num_latents: int,
505
+ embed_dim: int,
506
+ width: int,
507
+ heads: int,
508
+ num_decoder_layers: int,
509
+ num_freqs: int = 8,
510
+ include_pi: bool = True,
511
+ qkv_bias: bool = True,
512
+ qk_norm: bool = False,
513
+ label_type: str = "binary",
514
+ drop_path_rate: float = 0.0,
515
+ scale_factor: float = 1.0,
516
+ ):
517
+ super().__init__()
518
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
519
+
520
+ self.post_kl = nn.Linear(embed_dim, width)
521
+
522
+ self.transformer = Transformer(
523
+ n_ctx=num_latents,
524
+ width=width,
525
+ layers=num_decoder_layers,
526
+ heads=heads,
527
+ qkv_bias=qkv_bias,
528
+ qk_norm=qk_norm,
529
+ drop_path_rate=drop_path_rate
530
+ )
531
+
532
+ self.geo_decoder = CrossAttentionDecoder(
533
+ fourier_embedder=self.fourier_embedder,
534
+ out_channels=1,
535
+ num_latents=num_latents,
536
+ width=width,
537
+ heads=heads,
538
+ qkv_bias=qkv_bias,
539
+ qk_norm=qk_norm,
540
+ label_type=label_type,
541
+ )
542
+
543
+ self.scale_factor = scale_factor
544
+ self.latent_shape = (num_latents, embed_dim)
545
+
546
+ def forward(self, latents):
547
+ latents = self.post_kl(latents)
548
+ latents = self.transformer(latents)
549
+ return latents
550
+
551
+ @torch.no_grad()
552
+ def latents2mesh(
553
+ self,
554
+ latents: torch.FloatTensor,
555
+ bounds: Union[Tuple[float], List[float], float] = 1.1,
556
+ octree_depth: int = 7,
557
+ num_chunks: int = 10000,
558
+ mc_level: float = -1 / 512,
559
+ octree_resolution: int = None,
560
+ mc_algo: str = 'dmc',
561
+ ):
562
+ device = latents.device
563
+
564
+ # 1. generate query points
565
+ if isinstance(bounds, float):
566
+ bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
567
+ bbox_min = np.array(bounds[0:3])
568
+ bbox_max = np.array(bounds[3:6])
569
+ bbox_size = bbox_max - bbox_min
570
+ xyz_samples, grid_size, length = generate_dense_grid_points(
571
+ bbox_min=bbox_min,
572
+ bbox_max=bbox_max,
573
+ octree_depth=octree_depth,
574
+ octree_resolution=octree_resolution,
575
+ indexing="ij"
576
+ )
577
+ xyz_samples = torch.FloatTensor(xyz_samples)
578
+
579
+ # 2. latents to 3d volume
580
+ batch_logits = []
581
+ batch_size = latents.shape[0]
582
+ for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
583
+ desc=f"MC Level {mc_level} Implicit Function:"):
584
+ queries = xyz_samples[start: start + num_chunks, :].to(device)
585
+ queries = queries.half()
586
+ batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
587
+
588
+ logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
589
+ if mc_level == -1:
590
+ mc_level = 0
591
+ logits = torch.sigmoid(logits) * 2 - 1
592
+ print(f'Training with soft labels, inference with sigmoid and marching cubes level 0.')
593
+ batch_logits.append(logits)
594
+ grid_logits = torch.cat(batch_logits, dim=1)
595
+ grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()
596
+
597
+ # 3. extract surface
598
+ outputs = []
599
+ for i in range(batch_size):
600
+ try:
601
+ if mc_algo == 'mc':
602
+ vertices, faces, normals, _ = measure.marching_cubes(
603
+ grid_logits[i].cpu().numpy(),
604
+ mc_level,
605
+ method="lewiner"
606
+ )
607
+ vertices = vertices / grid_size * bbox_size + bbox_min
608
+ elif mc_algo == 'dmc':
609
+ if not hasattr(self, 'dmc'):
610
+ try:
611
+ from diso import DiffDMC
612
+ except:
613
+ raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
614
+ self.dmc = DiffDMC(dtype=torch.float32).to(device)
615
+ octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
616
+ sdf = -grid_logits[i] / octree_resolution
617
+ verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
618
+ verts = center_vertices(verts)
619
+ vertices = verts.detach().cpu().numpy()
620
+ faces = faces.detach().cpu().numpy()[:, ::-1]
621
+ else:
622
+ raise ValueError(f"mc_algo {mc_algo} not supported.")
623
+
624
+ outputs.append(
625
+ Latent2MeshOutput(
626
+ mesh_v=vertices.astype(np.float32),
627
+ mesh_f=np.ascontiguousarray(faces)
628
+ )
629
+ )
630
+
631
+ except ValueError:
632
+ outputs.append(None)
633
+ except RuntimeError:
634
+ outputs.append(None)
635
+
636
+ return outputs
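The latents2mesh routine above follows a grid-evaluate-then-extract pattern: query the implicit decoder over a dense grid of points in chunks, reshape the logits into a volume, and run marching cubes (or DMC) on it. Below is a minimal, self-contained sketch of the same pattern with a toy implicit function; `sphere_sdf` is a hypothetical stand-in for the learned `geo_decoder`, and only numpy and scikit-image are assumed.

# Sketch only: chunked evaluation of an implicit function on a dense grid,
# followed by marching cubes and a rescale back into the bounding box,
# mirroring steps 1-3 of latents2mesh above.
import numpy as np
from skimage import measure

def sphere_sdf(points, radius=0.5):
    # toy stand-in for geo_decoder: positive inside the surface, negative outside
    return radius - np.linalg.norm(points, axis=-1)

bounds = np.array([-1.1, -1.1, -1.1, 1.1, 1.1, 1.1])
bbox_min, bbox_max = bounds[:3], bounds[3:]
resolution = 64

axes = [np.linspace(bbox_min[i], bbox_max[i], resolution) for i in range(3)]
grid = np.stack(np.meshgrid(*axes, indexing="ij"), axis=-1).reshape(-1, 3)

# evaluate in chunks, as the num_chunks loop does above
logits = np.concatenate([sphere_sdf(chunk) for chunk in np.array_split(grid, 8)])
volume = logits.reshape(resolution, resolution, resolution)

verts, faces, normals, _ = measure.marching_cubes(volume, level=0.0)
# map voxel indices back into world coordinates, as in the 'mc' branch above
verts = verts / (resolution - 1) * (bbox_max - bbox_min) + bbox_min

The real method additionally batches the queries, runs them through the cross-attention decoder in half precision, and can use DiffDMC instead of scikit-image when mc_algo='dmc'.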
build/lib/hy3dgen/shapegen/pipelines.py ADDED
@@ -0,0 +1,589 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import copy
26
+ import importlib
27
+ import inspect
28
+ import logging
29
+ import os
30
+ from typing import List, Optional, Union
31
+
32
+ import numpy as np
33
+ import torch
34
+ import trimesh
35
+ import yaml
36
+ from PIL import Image
37
+ from diffusers.utils.torch_utils import randn_tensor
38
+ from tqdm import tqdm
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ def retrieve_timesteps(
44
+ scheduler,
45
+ num_inference_steps: Optional[int] = None,
46
+ device: Optional[Union[str, torch.device]] = None,
47
+ timesteps: Optional[List[int]] = None,
48
+ sigmas: Optional[List[float]] = None,
49
+ **kwargs,
50
+ ):
51
+ """
52
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
53
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
54
+
55
+ Args:
56
+ scheduler (`SchedulerMixin`):
57
+ The scheduler to get timesteps from.
58
+ num_inference_steps (`int`):
59
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
60
+ must be `None`.
61
+ device (`str` or `torch.device`, *optional*):
62
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
63
+ timesteps (`List[int]`, *optional*):
64
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
65
+ `num_inference_steps` and `sigmas` must be `None`.
66
+ sigmas (`List[float]`, *optional*):
67
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
68
+ `num_inference_steps` and `timesteps` must be `None`.
69
+
70
+ Returns:
71
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
72
+ second element is the number of inference steps.
73
+ """
74
+ if timesteps is not None and sigmas is not None:
75
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
76
+ if timesteps is not None:
77
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
78
+ if not accepts_timesteps:
79
+ raise ValueError(
80
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
81
+ f" timestep schedules. Please check whether you are using the correct scheduler."
82
+ )
83
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
84
+ timesteps = scheduler.timesteps
85
+ num_inference_steps = len(timesteps)
86
+ elif sigmas is not None:
87
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
88
+ if not accept_sigmas:
89
+ raise ValueError(
90
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
91
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
92
+ )
93
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
94
+ timesteps = scheduler.timesteps
95
+ num_inference_steps = len(timesteps)
96
+ else:
97
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
98
+ timesteps = scheduler.timesteps
99
+ return timesteps, num_inference_steps
100
+
101
+
102
+ def export_to_trimesh(mesh_output):
103
+ if isinstance(mesh_output, list):
104
+ outputs = []
105
+ for mesh in mesh_output:
106
+ if mesh is None:
107
+ outputs.append(None)
108
+ else:
109
+ mesh.mesh_f = mesh.mesh_f[:, ::-1]
110
+ mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f)
111
+ outputs.append(mesh_output)
112
+ return outputs
113
+ else:
114
+ mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1]
115
+ mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f)
116
+ return mesh_output
117
+
118
+
119
+ def get_obj_from_str(string, reload=False):
120
+ module, cls = string.rsplit(".", 1)
121
+ if reload:
122
+ module_imp = importlib.import_module(module)
123
+ importlib.reload(module_imp)
124
+ return getattr(importlib.import_module(module, package=None), cls)
125
+
126
+
127
+ def instantiate_from_config(config, **kwargs):
128
+ if "target" not in config:
129
+ raise KeyError("Expected key `target` to instantiate.")
130
+ cls = get_obj_from_str(config["target"])
131
+ params = config.get("params", dict())
132
+ kwargs.update(params)
133
+ instance = cls(**kwargs)
134
+ return instance
135
+
136
+
137
+ class Hunyuan3DDiTPipeline:
138
+ @classmethod
139
+ def from_single_file(
140
+ cls,
141
+ ckpt_path,
142
+ config_path,
143
+ device='cpu',
144
+ dtype=torch.float16,
145
+ **kwargs,
146
+ ):
147
+ # load config
148
+ with open(config_path, 'r') as f:
149
+ config = yaml.safe_load(f)
150
+
151
+ # load ckpt
152
+ if not os.path.exists(ckpt_path):
153
+ raise FileNotFoundError(f"Model file {ckpt_path} not found")
154
+ logger.info(f"Loading model from {ckpt_path}")
155
+
156
+ if ckpt_path.endswith('.safetensors'):
157
+ # parse safetensors
158
+ import safetensors.torch
159
+ safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
160
+ ckpt = {}
161
+ for key, value in safetensors_ckpt.items():
162
+ model_name = key.split('.')[0]
163
+ new_key = key[len(model_name) + 1:]
164
+ if model_name not in ckpt:
165
+ ckpt[model_name] = {}
166
+ ckpt[model_name][new_key] = value
167
+ else:
168
+ ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
169
+
170
+ # load model
171
+ from accelerate import init_empty_weights
172
+ with init_empty_weights():
173
+ model = instantiate_from_config(config['model'])
174
+ vae = instantiate_from_config(config['vae'])
175
+ conditioner = instantiate_from_config(config['conditioner'])
176
+ image_processor = instantiate_from_config(config['image_processor'])
177
+ scheduler = instantiate_from_config(config['scheduler'])
178
+
179
+ model.load_state_dict(ckpt['model'], assign=True)
180
+ vae.load_state_dict(ckpt['vae'], assign=True)
181
+ if 'conditioner' in ckpt:
182
+ conditioner.load_state_dict(ckpt['conditioner'], assign=True)
183
+
184
+ model_kwargs = dict(
185
+ vae=vae,
186
+ model=model,
187
+ scheduler=scheduler,
188
+ conditioner=conditioner,
189
+ image_processor=image_processor,
190
+ device=device,
191
+ dtype=dtype,
192
+ )
193
+ model_kwargs.update(kwargs)
194
+
195
+ return cls(
196
+ **model_kwargs
197
+ )
198
+
199
+ @classmethod
200
+ def from_pretrained(
201
+ cls,
202
+ model_path,
203
+ device='cuda',
204
+ dtype=torch.float16,
205
+ use_safetensors=None,
206
+ variant=None,
207
+ subfolder='hunyuan3d-dit-v2-0',
208
+ **kwargs,
209
+ ):
210
+ original_model_path = model_path
211
+ if not os.path.exists(model_path):
212
+ # try local path
213
+ base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
214
+ model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
215
+ if not os.path.exists(model_path):
216
+ try:
217
+ import huggingface_hub
218
+ # download from huggingface
219
+ path = huggingface_hub.snapshot_download(repo_id=original_model_path)
220
+ model_path = os.path.join(path, subfolder)
221
+ except ImportError:
222
+ logger.warning(
223
+ "You need to install HuggingFace Hub to load models from the hub."
224
+ )
225
+ raise RuntimeError(f"Model path {model_path} not found")
226
+ if not os.path.exists(model_path):
227
+ raise FileNotFoundError(f"Model path {original_model_path} not found")
228
+
229
+ extension = 'ckpt' if not use_safetensors else 'safetensors'
230
+ variant = '' if variant is None else f'.{variant}'
231
+ ckpt_name = f'model{variant}.{extension}'
232
+ config_path = os.path.join(model_path, 'config.yaml')
233
+ ckpt_path = os.path.join(model_path, ckpt_name)
234
+
235
+ return cls.from_single_file(
236
+ ckpt_path,
237
+ config_path,
238
+ device=device,
239
+ dtype=dtype,
240
+ use_safetensors=use_safetensors,
241
+ variant=variant,
242
+ **kwargs
243
+ )
244
+
245
+ def __init__(
246
+ self,
247
+ vae,
248
+ model,
249
+ scheduler,
250
+ conditioner,
251
+ image_processor,
252
+ device='cuda',
253
+ dtype=torch.float16,
254
+ **kwargs
255
+ ):
256
+ self.vae = vae
257
+ self.model = model
258
+ self.scheduler = scheduler
259
+ self.conditioner = conditioner
260
+ self.image_processor = image_processor
261
+
262
+ self.to(device, dtype)
263
+
264
+ def to(self, device=None, dtype=None):
265
+ if device is not None:
266
+ self.device = torch.device(device)
267
+ self.vae.to(device)
268
+ self.model.to(device)
269
+ self.conditioner.to(device)
270
+ if dtype is not None:
271
+ self.dtype = dtype
272
+ self.vae.to(dtype=dtype)
273
+ self.model.to(dtype=dtype)
274
+ self.conditioner.to(dtype=dtype)
275
+
276
+ def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
277
+ bsz = image.shape[0]
278
+ cond = self.conditioner(image=image, mask=mask)
279
+
280
+ if do_classifier_free_guidance:
281
+ un_cond = self.conditioner.unconditional_embedding(bsz)
282
+
283
+ if dual_guidance:
284
+ un_cond_drop_main = copy.deepcopy(un_cond)
285
+ un_cond_drop_main['additional'] = cond['additional']
286
+
287
+ def cat_recursive(a, b, c):
288
+ if isinstance(a, torch.Tensor):
289
+ return torch.cat([a, b, c], dim=0).to(self.dtype)
290
+ out = {}
291
+ for k in a.keys():
292
+ out[k] = cat_recursive(a[k], b[k], c[k])
293
+ return out
294
+
295
+ cond = cat_recursive(cond, un_cond_drop_main, un_cond)
296
+ else:
297
+ un_cond = self.conditioner.unconditional_embedding(bsz)
298
+
299
+ def cat_recursive(a, b):
300
+ if isinstance(a, torch.Tensor):
301
+ return torch.cat([a, b], dim=0).to(self.dtype)
302
+ out = {}
303
+ for k in a.keys():
304
+ out[k] = cat_recursive(a[k], b[k])
305
+ return out
306
+
307
+ cond = cat_recursive(cond, un_cond)
308
+ return cond
309
+
310
+ def prepare_extra_step_kwargs(self, generator, eta):
311
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
312
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
313
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
314
+ # and should be between [0, 1]
315
+
316
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
317
+ extra_step_kwargs = {}
318
+ if accepts_eta:
319
+ extra_step_kwargs["eta"] = eta
320
+
321
+ # check if the scheduler accepts generator
322
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
323
+ if accepts_generator:
324
+ extra_step_kwargs["generator"] = generator
325
+ return extra_step_kwargs
326
+
327
+ def prepare_latents(self, batch_size, dtype, device, generator, latents=None):
328
+ shape = (batch_size, *self.vae.latent_shape)
329
+ if isinstance(generator, list) and len(generator) != batch_size:
330
+ raise ValueError(
331
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
332
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
333
+ )
334
+
335
+ if latents is None:
336
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
337
+ else:
338
+ latents = latents.to(device)
339
+
340
+ # scale the initial noise by the standard deviation required by the scheduler
341
+ latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
342
+ return latents
343
+
344
+ def prepare_image(self, image):
345
+ if isinstance(image, str) and not os.path.exists(image):
346
+ raise FileNotFoundError(f"Couldn't find image at path {image}")
347
+
348
+ if not isinstance(image, list):
349
+ image = [image]
350
+ image_pts = []
351
+ mask_pts = []
352
+ for img in image:
353
+ image_pt, mask_pt = self.image_processor(img, return_mask=True)
354
+ image_pts.append(image_pt)
355
+ mask_pts.append(mask_pt)
356
+
357
+ image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
358
+ if mask_pts[0] is not None:
359
+ mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
360
+ else:
361
+ mask_pts = None
362
+ return image_pts, mask_pts
363
+
364
+ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
365
+ """
366
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
367
+
368
+ Args:
369
+ w (`torch.Tensor`):
370
+ guidance scale values at which to generate the embedding vectors
371
+ embedding_dim (`int`, *optional*, defaults to 512):
372
+ dimension of the embeddings to generate
373
+ dtype:
374
+ data type of the generated embeddings
375
+
376
+ Returns:
377
+ `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
378
+ """
379
+ assert len(w.shape) == 1
380
+ w = w * 1000.0
381
+
382
+ half_dim = embedding_dim // 2
383
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
384
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
385
+ emb = w.to(dtype)[:, None] * emb[None, :]
386
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
387
+ if embedding_dim % 2 == 1: # zero pad
388
+ emb = torch.nn.functional.pad(emb, (0, 1))
389
+ assert emb.shape == (w.shape[0], embedding_dim)
390
+ return emb
391
+
392
+ @torch.no_grad()
393
+ def __call__(
394
+ self,
395
+ image: Union[str, List[str], Image.Image] = None,
396
+ num_inference_steps: int = 50,
397
+ timesteps: List[int] = None,
398
+ sigmas: List[float] = None,
399
+ eta: float = 0.0,
400
+ guidance_scale: float = 7.5,
401
+ dual_guidance_scale: float = 10.5,
402
+ dual_guidance: bool = True,
403
+ generator=None,
404
+ box_v=1.01,
405
+ octree_resolution=384,
406
+ mc_level=-1 / 512,
407
+ num_chunks=8000,
408
+ mc_algo='mc',
409
+ output_type: Optional[str] = "trimesh",
410
+ enable_pbar=True,
411
+ **kwargs,
412
+ ) -> List[List[trimesh.Trimesh]]:
413
+ callback = kwargs.pop("callback", None)
414
+ callback_steps = kwargs.pop("callback_steps", None)
415
+
416
+ device = self.device
417
+ dtype = self.dtype
418
+ do_classifier_free_guidance = guidance_scale >= 0 and \
419
+ getattr(self.model, 'guidance_cond_proj_dim', None) is None
420
+ dual_guidance = dual_guidance_scale >= 0 and dual_guidance
421
+
422
+ image, mask = self.prepare_image(image)
423
+ cond = self.encode_cond(image=image,
424
+ mask=mask,
425
+ do_classifier_free_guidance=do_classifier_free_guidance,
426
+ dual_guidance=dual_guidance)
427
+ batch_size = image.shape[0]
428
+
429
+ t_dtype = torch.long
430
+ timesteps, num_inference_steps = retrieve_timesteps(
431
+ self.scheduler, num_inference_steps, device, timesteps, sigmas)
432
+
433
+ latents = self.prepare_latents(batch_size, dtype, device, generator)
434
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
435
+
436
+ guidance_cond = None
437
+ if getattr(self.model, 'guidance_cond_proj_dim', None) is not None:
438
+ print('Using lcm guidance scale')
439
+ guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size)
440
+ guidance_cond = self.get_guidance_scale_embedding(
441
+ guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim
442
+ ).to(device=device, dtype=latents.dtype)
443
+
444
+ for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)):
445
+ # expand the latents if we are doing classifier free guidance
446
+ if do_classifier_free_guidance:
447
+ latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2))
448
+ else:
449
+ latent_model_input = latents
450
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
451
+
452
+ # predict the noise residual
453
+ timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device)
454
+ timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
455
+ noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond)
456
+
457
+ # no drop, drop clip, all drop
458
+ if do_classifier_free_guidance:
459
+ if dual_guidance:
460
+ noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3)
461
+ noise_pred = (
462
+ noise_pred_uncond
463
+ + guidance_scale * (noise_pred_clip - noise_pred_dino)
464
+ + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond)
465
+ )
466
+ else:
467
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
468
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
469
+
470
+ # compute the previous noisy sample x_t -> x_t-1
471
+ outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
472
+ latents = outputs.prev_sample
473
+
474
+ if callback is not None and i % callback_steps == 0:
475
+ step_idx = i // getattr(self.scheduler, "order", 1)
476
+ callback(step_idx, t, outputs)
477
+
478
+ return self._export(
479
+ latents,
480
+ output_type,
481
+ box_v, mc_level, num_chunks, octree_resolution, mc_algo,
482
+ )
483
+
484
+ def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo):
485
+ if not output_type == "latent":
486
+ latents = 1. / self.vae.scale_factor * latents
487
+ latents = self.vae(latents)
488
+ outputs = self.vae.latents2mesh(
489
+ latents,
490
+ bounds=box_v,
491
+ mc_level=mc_level,
492
+ num_chunks=num_chunks,
493
+ octree_resolution=octree_resolution,
494
+ mc_algo=mc_algo,
495
+ )
496
+ else:
497
+ outputs = latents
498
+
499
+ if output_type == 'trimesh':
500
+ outputs = export_to_trimesh(outputs)
501
+
502
+ return outputs
503
+
504
+
505
+ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
506
+
507
+ @torch.no_grad()
508
+ def __call__(
509
+ self,
510
+ image: Union[str, List[str], Image.Image] = None,
511
+ num_inference_steps: int = 50,
512
+ timesteps: List[int] = None,
513
+ sigmas: List[float] = None,
514
+ eta: float = 0.0,
515
+ guidance_scale: float = 7.5,
516
+ generator=None,
517
+ box_v=1.01,
518
+ octree_resolution=384,
519
+ mc_level=0.0,
520
+ mc_algo='mc',
521
+ num_chunks=8000,
522
+ output_type: Optional[str] = "trimesh",
523
+ enable_pbar=True,
524
+ **kwargs,
525
+ ) -> List[List[trimesh.Trimesh]]:
526
+ callback = kwargs.pop("callback", None)
527
+ callback_steps = kwargs.pop("callback_steps", None)
528
+
529
+ device = self.device
530
+ dtype = self.dtype
531
+ do_classifier_free_guidance = guidance_scale >= 0 and not (
532
+ hasattr(self.model, 'guidance_embed') and
533
+ self.model.guidance_embed is True
534
+ )
535
+
536
+ image, mask = self.prepare_image(image)
537
+ cond = self.encode_cond(
538
+ image=image,
539
+ mask=mask,
540
+ do_classifier_free_guidance=do_classifier_free_guidance,
541
+ dual_guidance=False,
542
+ )
543
+ batch_size = image.shape[0]
544
+
545
+ # 5. Prepare timesteps
546
+ # NOTE: this is slightly different from common usage, we start from 0.
547
+ sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas
548
+ timesteps, num_inference_steps = retrieve_timesteps(
549
+ self.scheduler,
550
+ num_inference_steps,
551
+ device,
552
+ sigmas=sigmas,
553
+ )
554
+ latents = self.prepare_latents(batch_size, dtype, device, generator)
555
+
556
+ guidance = None
557
+ if hasattr(self.model, 'guidance_embed') and \
558
+ self.model.guidance_embed is True:
559
+ guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
560
+
561
+ for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
562
+ # expand the latents if we are doing classifier free guidance
563
+ if do_classifier_free_guidance:
564
+ latent_model_input = torch.cat([latents] * 2)
565
+ else:
566
+ latent_model_input = latents
567
+
568
+ # NOTE: we assume model get timesteps ranged from 0 to 1
569
+ timestep = t.expand(latent_model_input.shape[0]).to(
570
+ latents.dtype) / self.scheduler.config.num_train_timesteps
571
+ noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance)
572
+
573
+ if do_classifier_free_guidance:
574
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
575
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
576
+
577
+ # compute the previous noisy sample x_t -> x_t-1
578
+ outputs = self.scheduler.step(noise_pred, t, latents)
579
+ latents = outputs.prev_sample
580
+
581
+ if callback is not None and i % callback_steps == 0:
582
+ step_idx = i // getattr(self.scheduler, "order", 1)
583
+ callback(step_idx, t, outputs)
584
+
585
+ return self._export(
586
+ latents,
587
+ output_type,
588
+ box_v, mc_level, num_chunks, octree_resolution, mc_algo,
589
+ )
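Taken together, the two pipeline classes above load a config plus checkpoint, encode the image condition, run the denoising loop, and hand the latents to the VAE for surface extraction. A hedged usage sketch follows; the repo id and file names are placeholders, while the import path follows the module layout of this diff.

# Sketch only: typical end-to-end use of the flow-matching pipeline above.
# 'tencent/Hunyuan3D-2' and 'demo.png' are placeholder names, not verified here.
import torch
from hy3dgen.shapegen.pipelines import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'tencent/Hunyuan3D-2',      # resolved locally, via HY3DGEN_MODELS, or huggingface_hub
    device='cuda',
    dtype=torch.float16,
)

# __call__ accepts an image path, a PIL image, or a list of either; with the default
# output_type='trimesh' the latents are decoded by vae.latents2mesh and converted by
# export_to_trimesh, so a list of trimesh.Trimesh objects comes back.
meshes = pipeline(image='demo.png', num_inference_steps=50, octree_resolution=384)
meshes[0].export('demo.glb')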
build/lib/hy3dgen/shapegen/postprocessors.py ADDED
@@ -0,0 +1,175 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import os
26
+ import tempfile
27
+ from typing import Union
28
+
29
+ import pymeshlab
30
+ import trimesh
31
+
32
+ from .models.vae import Latent2MeshOutput
33
+
34
+
35
+ def load_mesh(path):
36
+ if path.endswith(".glb"):
37
+ mesh = trimesh.load(path)
38
+ else:
39
+ mesh = pymeshlab.MeshSet()
40
+ mesh.load_new_mesh(path)
41
+ return mesh
42
+
43
+
44
+ def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
45
+ mesh.apply_filter(
46
+ "meshing_decimation_quadric_edge_collapse",
47
+ targetfacenum=max_facenum,
48
+ qualitythr=1.0,
49
+ preserveboundary=True,
50
+ boundaryweight=3,
51
+ preservenormal=True,
52
+ preservetopology=True,
53
+ autoclean=True
54
+ )
55
+ return mesh
56
+
57
+
58
+ def remove_floater(mesh: pymeshlab.MeshSet):
59
+ mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
60
+ nbfaceratio=0.005)
61
+ mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
62
+ mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
63
+ return mesh
64
+
65
+
66
+ def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
67
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
68
+ temp_file.close()
69
+ temp_file_name = temp_file.name
70
+
71
+ mesh.save_current_mesh(temp_file_name)
72
+ mesh = trimesh.load(temp_file_name)
73
+ if os.path.exists(temp_file_name):
74
+ os.remove(temp_file_name)
75
+
76
+ # Check the type of the loaded object
77
+ if isinstance(mesh, trimesh.Scene):
78
+ combined_mesh = trimesh.Trimesh()
79
+ # If it is a Scene, iterate over all geometries and merge them
80
+ for geom in mesh.geometry.values():
81
+ combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
82
+ mesh = combined_mesh
83
+ return mesh
84
+
85
+
86
+ def trimesh2pymeshlab(mesh: trimesh.Trimesh):
87
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
88
+ temp_file.close()
89
+ temp_file_name = temp_file.name
90
+
91
+ if isinstance(mesh, trimesh.scene.Scene):
92
+ for idx, obj in enumerate(mesh.geometry.values()):
93
+ if idx == 0:
94
+ temp_mesh = obj
95
+ else:
96
+ temp_mesh = temp_mesh + obj
97
+ mesh = temp_mesh
98
+ mesh.export(temp_file_name)
99
+ mesh = pymeshlab.MeshSet()
100
+ mesh.load_new_mesh(temp_file_name)
101
+ if os.path.exists(temp_file_name):
102
+ os.remove(temp_file_name)
103
+
104
+ return mesh
105
+
106
+
107
+ def export_mesh(input, output):
108
+ if isinstance(input, pymeshlab.MeshSet):
109
+ mesh = output
110
+ elif isinstance(input, Latent2MeshOutput):
111
+ out = Latent2MeshOutput()
112
+ out.mesh_v = output.current_mesh().vertex_matrix()
113
+ out.mesh_f = output.current_mesh().face_matrix()
114
+ mesh = out
115
+ else:
116
+ mesh = pymeshlab2trimesh(output)
117
+ return mesh
118
+
119
+
120
+ def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
121
+ if isinstance(mesh, str):
122
+ mesh = load_mesh(mesh)
123
+ elif isinstance(mesh, Latent2MeshOutput):
124
+ mesh = pymeshlab.MeshSet()
125
+ mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
126
+ mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
127
+
128
+ if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
129
+ mesh = trimesh2pymeshlab(mesh)
130
+
131
+ return mesh
132
+
133
+
134
+ class FaceReducer:
135
+ def __call__(
136
+ self,
137
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
138
+ max_facenum: int = 40000
139
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
140
+ ms = import_mesh(mesh)
141
+ ms = reduce_face(ms, max_facenum=max_facenum)
142
+ mesh = export_mesh(mesh, ms)
143
+ return mesh
144
+
145
+
146
+ class FloaterRemover:
147
+ def __call__(
148
+ self,
149
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
150
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
151
+ ms = import_mesh(mesh)
152
+ ms = remove_floater(ms)
153
+ mesh = export_mesh(mesh, ms)
154
+ return mesh
155
+
156
+
157
+ class DegenerateFaceRemover:
158
+ def __call__(
159
+ self,
160
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
161
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
162
+ ms = import_mesh(mesh)
163
+
164
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
165
+ temp_file.close()
166
+ temp_file_name = temp_file.name
167
+
168
+ ms.save_current_mesh(temp_file_name)
169
+ ms = pymeshlab.MeshSet()
170
+ ms.load_new_mesh(temp_file_name)
171
+ if os.path.exists(temp_file_name):
172
+ os.remove(temp_file_name)
173
+
174
+ mesh = export_mesh(mesh, ms)
175
+ return mesh
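The three callables above share the import_mesh/export_mesh round trip, so they can be chained directly on a mesh produced by the shape pipeline. A small sketch; the file names are hypothetical and the threshold is the default visible in the signature.

# Sketch only: chaining the postprocessors above on a generated mesh.
import trimesh
from hy3dgen.shapegen.postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover

mesh = trimesh.load('generated.glb')             # hypothetical shape-pipeline output
mesh = FloaterRemover()(mesh)                    # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)             # round-trip through pymeshlab to clean faces
mesh = FaceReducer()(mesh, max_facenum=40000)    # quadric edge-collapse decimation
mesh.export('generated_clean.glb')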
build/lib/hy3dgen/shapegen/preprocessors.py ADDED
@@ -0,0 +1,127 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
6
+ # The below software and/or models in this distribution may have been
7
+ # modified by THL A29 Limited ("Tencent Modifications").
8
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
9
+
10
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
11
+ # except for the third-party components listed below.
12
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
13
+ # in the respective licenses of these third-party components.
14
+ # Users must comply with all terms and conditions of original licenses of these third-party
15
+ # components and must ensure that the usage of the third party components adheres to
16
+ # all relevant laws and regulations.
17
+
18
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
19
+ # their software and algorithms, including trained model weights, parameters (including
20
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
21
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
22
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
23
+
24
+ import cv2
25
+ import numpy as np
26
+ import torch
27
+ from PIL import Image
28
+ from einops import repeat, rearrange
29
+
30
+
31
+ def array_to_tensor(np_array):
32
+ image_pt = torch.tensor(np_array).float()
33
+ image_pt = image_pt / 255 * 2 - 1
34
+ image_pt = rearrange(image_pt, "h w c -> c h w")
35
+ image_pts = repeat(image_pt, "c h w -> b c h w", b=1)
36
+ return image_pts
37
+
38
+
39
+ class ImageProcessorV2:
40
+ def __init__(self, size=512, border_ratio=None):
41
+ self.size = size
42
+ self.border_ratio = border_ratio
43
+
44
+ @staticmethod
45
+ def recenter(image, border_ratio: float = 0.2):
46
+ """ recenter an image to leave some empty space at the image border.
47
+
48
+ Args:
49
+ image (ndarray): input image, float/uint8 [H, W, 3/4]
50
+ mask (ndarray): alpha mask, bool [H, W]
51
+ border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2.
52
+
53
+ Returns:
54
+ ndarray: output image, float/uint8 [H, W, 3/4]
55
+ """
56
+
57
+ if image.shape[-1] == 4:
58
+ mask = image[..., 3]
59
+ else:
60
+ mask = np.ones_like(image[..., 0:1]) * 255
61
+ image = np.concatenate([image, mask], axis=-1)
62
+ mask = mask[..., 0]
63
+
64
+ H, W, C = image.shape
65
+
66
+ size = max(H, W)
67
+ result = np.zeros((size, size, C), dtype=np.uint8)
68
+
69
+ coords = np.nonzero(mask)
70
+ x_min, x_max = coords[0].min(), coords[0].max()
71
+ y_min, y_max = coords[1].min(), coords[1].max()
72
+ h = x_max - x_min
73
+ w = y_max - y_min
74
+ if h == 0 or w == 0:
75
+ raise ValueError('input image is empty')
76
+ desired_size = int(size * (1 - border_ratio))
77
+ scale = desired_size / max(h, w)
78
+ h2 = int(h * scale)
79
+ w2 = int(w * scale)
80
+ x2_min = (size - h2) // 2
81
+ x2_max = x2_min + h2
82
+
83
+ y2_min = (size - w2) // 2
84
+ y2_max = y2_min + w2
85
+
86
+ result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2),
87
+ interpolation=cv2.INTER_AREA)
88
+
89
+ bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
90
+ # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
91
+ mask = result[..., 3:].astype(np.float32) / 255
92
+ result = result[..., :3] * mask + bg * (1 - mask)
93
+
94
+ mask = mask * 255
95
+ result = result.clip(0, 255).astype(np.uint8)
96
+ mask = mask.clip(0, 255).astype(np.uint8)
97
+ return result, mask
98
+
99
+ def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
100
+ if self.border_ratio is not None:
101
+ border_ratio = self.border_ratio
102
+ print(f"Using border_ratio from init: {border_ratio}")
103
+ if isinstance(image, str):
104
+ image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
105
+ image, mask = self.recenter(image, border_ratio=border_ratio)
106
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
107
+ elif isinstance(image, Image.Image):
108
+ image = np.asarray(image)
109
+ image, mask = self.recenter(image, border_ratio=border_ratio)
110
+
111
+ image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
112
+ mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST)
113
+ mask = mask[..., np.newaxis]
114
+
115
+ if to_tensor:
116
+ image = array_to_tensor(image)
117
+ mask = array_to_tensor(mask)
118
+ if return_mask:
119
+ return image, mask
120
+ return image
121
+
122
+
123
+ IMAGE_PROCESSORS = {
124
+ "v2": ImageProcessorV2,
125
+ }
126
+
127
+ DEFAULT_IMAGEPROCESSOR = 'v2'
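ImageProcessorV2 recenters the foreground with a border margin, composites it over white, resizes to the target resolution, and returns tensors scaled to [-1, 1]. A quick sketch of the shapes it produces; 'input.png' is a hypothetical file.

# Sketch only: inspecting what the preprocessor above returns for one image.
from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)
image, mask = processor('input.png', border_ratio=0.15, return_mask=True)
print(image.shape, mask.shape)                  # torch.Size([1, 3, 512, 512]) torch.Size([1, 1, 512, 512])
print(image.min().item(), image.max().item())   # roughly -1.0 and 1.0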
build/lib/hy3dgen/shapegen/schedulers.py ADDED
@@ -0,0 +1,307 @@
1
+ # Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from dataclasses import dataclass
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+
25
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
26
+
27
+
28
+ @dataclass
29
+ class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
30
+ """
31
+ Output class for the scheduler's `step` function output.
32
+
33
+ Args:
34
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
35
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
36
+ denoising loop.
37
+ """
38
+
39
+ prev_sample: torch.FloatTensor
40
+
41
+
42
+ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
43
+ """
44
+ NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler, except that our timesteps are reversed.
45
+
46
+ Euler scheduler.
47
+
48
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
49
+ methods the library implements for all schedulers such as loading and saving.
50
+
51
+ Args:
52
+ num_train_timesteps (`int`, defaults to 1000):
53
+ The number of diffusion steps to train the model.
54
+ timestep_spacing (`str`, defaults to `"linspace"`):
55
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
56
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
57
+ shift (`float`, defaults to 1.0):
58
+ The shift value for the timestep schedule.
59
+ """
60
+
61
+ _compatibles = []
62
+ order = 1
63
+
64
+ @register_to_config
65
+ def __init__(
66
+ self,
67
+ num_train_timesteps: int = 1000,
68
+ shift: float = 1.0,
69
+ use_dynamic_shifting=False,
70
+ ):
71
+ timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy()
72
+ timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32)
73
+
74
+ sigmas = timesteps / num_train_timesteps
75
+ if not use_dynamic_shifting:
76
+ # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
77
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
78
+
79
+ self.timesteps = sigmas * num_train_timesteps
80
+
81
+ self._step_index = None
82
+ self._begin_index = None
83
+
84
+ self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
85
+ self.sigma_min = self.sigmas[-1].item()
86
+ self.sigma_max = self.sigmas[0].item()
87
+
88
+ @property
89
+ def step_index(self):
90
+ """
91
+ The index counter for the current timestep. It increases by 1 after each scheduler step.
92
+ """
93
+ return self._step_index
94
+
95
+ @property
96
+ def begin_index(self):
97
+ """
98
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
99
+ """
100
+ return self._begin_index
101
+
102
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
103
+ def set_begin_index(self, begin_index: int = 0):
104
+ """
105
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
106
+
107
+ Args:
108
+ begin_index (`int`):
109
+ The begin index for the scheduler.
110
+ """
111
+ self._begin_index = begin_index
112
+
113
+ def scale_noise(
114
+ self,
115
+ sample: torch.FloatTensor,
116
+ timestep: Union[float, torch.FloatTensor],
117
+ noise: Optional[torch.FloatTensor] = None,
118
+ ) -> torch.FloatTensor:
119
+ """
120
+ Forward process in flow-matching
121
+
122
+ Args:
123
+ sample (`torch.FloatTensor`):
124
+ The input sample.
125
+ timestep (`int`, *optional*):
126
+ The current timestep in the diffusion chain.
127
+
128
+ Returns:
129
+ `torch.FloatTensor`:
130
+ A scaled input sample.
131
+ """
132
+ # Make sure sigmas and timesteps have the same device and dtype as original_samples
133
+ sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype)
134
+
135
+ if sample.device.type == "mps" and torch.is_floating_point(timestep):
136
+ # mps does not support float64
137
+ schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32)
138
+ timestep = timestep.to(sample.device, dtype=torch.float32)
139
+ else:
140
+ schedule_timesteps = self.timesteps.to(sample.device)
141
+ timestep = timestep.to(sample.device)
142
+
143
+ # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
144
+ if self.begin_index is None:
145
+ step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep]
146
+ elif self.step_index is not None:
147
+ # add_noise is called after first denoising step (for inpainting)
148
+ step_indices = [self.step_index] * timestep.shape[0]
149
+ else:
150
+ # add noise is called before first denoising step to create initial latent(img2img)
151
+ step_indices = [self.begin_index] * timestep.shape[0]
152
+
153
+ sigma = sigmas[step_indices].flatten()
154
+ while len(sigma.shape) < len(sample.shape):
155
+ sigma = sigma.unsqueeze(-1)
156
+
157
+ sample = sigma * noise + (1.0 - sigma) * sample
158
+
159
+ return sample
160
+
161
+ def _sigma_to_t(self, sigma):
162
+ return sigma * self.config.num_train_timesteps
163
+
164
+ def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
165
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
166
+
167
+ def set_timesteps(
168
+ self,
169
+ num_inference_steps: int = None,
170
+ device: Union[str, torch.device] = None,
171
+ sigmas: Optional[List[float]] = None,
172
+ mu: Optional[float] = None,
173
+ ):
174
+ """
175
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
176
+
177
+ Args:
178
+ num_inference_steps (`int`):
179
+ The number of diffusion steps used when generating samples with a pre-trained model.
180
+ device (`str` or `torch.device`, *optional*):
181
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
182
+ """
183
+
184
+ if self.config.use_dynamic_shifting and mu is None:
185
+ raise ValueError("You have to pass a value for `mu` when `use_dynamic_shifting` is set to `True`")
186
+
187
+ if sigmas is None:
188
+ self.num_inference_steps = num_inference_steps
189
+ timesteps = np.linspace(
190
+ self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
191
+ )
192
+
193
+ sigmas = timesteps / self.config.num_train_timesteps
194
+
195
+ if self.config.use_dynamic_shifting:
196
+ sigmas = self.time_shift(mu, 1.0, sigmas)
197
+ else:
198
+ sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
199
+
200
+ sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
201
+ timesteps = sigmas * self.config.num_train_timesteps
202
+
203
+ self.timesteps = timesteps.to(device=device)
204
+ self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)])
205
+
206
+ self._step_index = None
207
+ self._begin_index = None
208
+
209
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
210
+ if schedule_timesteps is None:
211
+ schedule_timesteps = self.timesteps
212
+
213
+ indices = (schedule_timesteps == timestep).nonzero()
214
+
215
+ # The sigma index that is taken for the **very** first `step`
216
+ # is always the second index (or the last index if there is only 1)
217
+ # This way we can ensure we don't accidentally skip a sigma in
218
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
219
+ pos = 1 if len(indices) > 1 else 0
220
+
221
+ return indices[pos].item()
222
+
223
+ def _init_step_index(self, timestep):
224
+ if self.begin_index is None:
225
+ if isinstance(timestep, torch.Tensor):
226
+ timestep = timestep.to(self.timesteps.device)
227
+ self._step_index = self.index_for_timestep(timestep)
228
+ else:
229
+ self._step_index = self._begin_index
230
+
231
+ def step(
232
+ self,
233
+ model_output: torch.FloatTensor,
234
+ timestep: Union[float, torch.FloatTensor],
235
+ sample: torch.FloatTensor,
236
+ s_churn: float = 0.0,
237
+ s_tmin: float = 0.0,
238
+ s_tmax: float = float("inf"),
239
+ s_noise: float = 1.0,
240
+ generator: Optional[torch.Generator] = None,
241
+ return_dict: bool = True,
242
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
243
+ """
244
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
245
+ process from the learned model outputs (most often the predicted noise).
246
+
247
+ Args:
248
+ model_output (`torch.FloatTensor`):
249
+ The direct output from learned diffusion model.
250
+ timestep (`float`):
251
+ The current discrete timestep in the diffusion chain.
252
+ sample (`torch.FloatTensor`):
253
+ A current instance of a sample created by the diffusion process.
254
+ s_churn (`float`):
255
+ s_tmin (`float`):
256
+ s_tmax (`float`):
257
+ s_noise (`float`, defaults to 1.0):
258
+ Scaling factor for noise added to the sample.
259
+ generator (`torch.Generator`, *optional*):
260
+ A random number generator.
261
+ return_dict (`bool`):
262
+ Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
263
+ tuple.
264
+
265
+ Returns:
266
+ [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
267
+ If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
268
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
269
+ """
270
+
271
+ if (
272
+ isinstance(timestep, int)
273
+ or isinstance(timestep, torch.IntTensor)
274
+ or isinstance(timestep, torch.LongTensor)
275
+ ):
276
+ raise ValueError(
277
+ (
278
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
279
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
280
+ " one of the `scheduler.timesteps` as a timestep."
281
+ ),
282
+ )
283
+
284
+ if self.step_index is None:
285
+ self._init_step_index(timestep)
286
+
287
+ # Upcast to avoid precision issues when computing prev_sample
288
+ sample = sample.to(torch.float32)
289
+
290
+ sigma = self.sigmas[self.step_index]
291
+ sigma_next = self.sigmas[self.step_index + 1]
292
+
293
+ prev_sample = sample + (sigma_next - sigma) * model_output
294
+
295
+ # Cast sample back to model compatible dtype
296
+ prev_sample = prev_sample.to(model_output.dtype)
297
+
298
+ # upon completion increase step index by one
299
+ self._step_index += 1
300
+
301
+ if not return_dict:
302
+ return (prev_sample,)
303
+
304
+ return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
305
+
306
+ def __len__(self):
307
+ return self.config.num_train_timesteps
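Because the sigmas here run from small to large (the reversed convention noted in the class docstring), each Euler step moves the sample along the predicted velocity by (sigma_next - sigma). Below is a toy loop showing the set_timesteps/step contract the pipelines rely on; the zero velocity model is purely illustrative.

# Sketch only: the scheduler API above driven with a dummy velocity prediction.
import torch
from hy3dgen.shapegen.schedulers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=1.0)
scheduler.set_timesteps(num_inference_steps=10)

sample = torch.randn(1, 4)                     # stand-in latent
for t in scheduler.timesteps:
    model_output = torch.zeros_like(sample)    # a real pipeline would call the DiT here
    sample = scheduler.step(model_output, t, sample).prev_sample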
build/lib/hy3dgen/texgen/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig
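For reference, the texture pipeline re-exported above is normally driven much like the shape pipeline. The loader name, repo id, and call signature in the sketch below are assumptions inferred from that symmetry and are not verified against texgen/pipelines.py in this diff.

# Sketch only; every name here except the import is an unverified assumption.
import trimesh
from hy3dgen.texgen import Hunyuan3DPaintPipeline

paint = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')   # assumed loader
mesh = trimesh.load('untextured.glb')                                   # hypothetical input
textured = paint(mesh, image='reference.png')                           # assumed signature
textured.export('textured.glb')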
build/lib/hy3dgen/texgen/differentiable_renderer/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/differentiable_renderer/camera_utils.py ADDED
@@ -0,0 +1,116 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import math
26
+
27
+ import numpy as np
28
+ import torch
29
+
30
+
31
+ def transform_pos(mtx, pos, keepdim=False):
32
+ t_mtx = torch.from_numpy(mtx).to(
33
+ pos.device) if isinstance(
34
+ mtx, np.ndarray) else mtx
35
+ if pos.shape[-1] == 3:
36
+ posw = torch.cat(
37
+ [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1)
38
+ else:
39
+ posw = pos
40
+
41
+ if keepdim:
42
+ return torch.matmul(posw, t_mtx.t())[...]
43
+ else:
44
+ return torch.matmul(posw, t_mtx.t())[None, ...]
45
+
46
+
47
+ def get_mv_matrix(elev, azim, camera_distance, center=None):
48
+ elev = -elev
49
+ azim += 90
50
+
51
+ elev_rad = math.radians(elev)
52
+ azim_rad = math.radians(azim)
53
+
54
+ camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad),
55
+ camera_distance *
56
+ math.cos(elev_rad) * math.sin(azim_rad),
57
+ camera_distance * math.sin(elev_rad)])
58
+
59
+ if center is None:
60
+ center = np.array([0, 0, 0])
61
+ else:
62
+ center = np.array(center)
63
+
64
+ lookat = center - camera_position
65
+ lookat = lookat / np.linalg.norm(lookat)
66
+
67
+ up = np.array([0, 0, 1.0])
68
+ right = np.cross(lookat, up)
69
+ right = right / np.linalg.norm(right)
70
+ up = np.cross(right, lookat)
71
+ up = up / np.linalg.norm(up)
72
+
73
+ c2w = np.concatenate(
74
+ [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1)
75
+
76
+ w2c = np.zeros((4, 4))
77
+ w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0))
78
+ w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:])
79
+ w2c[3, 3] = 1.0
80
+
81
+ return w2c.astype(np.float32)
82
+
83
+
84
+ def get_orthographic_projection_matrix(
85
+ left=-1, right=1, bottom=-1, top=1, near=0, far=2):
86
+ """
87
+ Compute an orthographic projection matrix.
88
+
89
+ Args:
90
+ left (float): Left boundary of the projection volume.
91
+ right (float): Right boundary of the projection volume.
92
+ bottom (float): Bottom boundary of the projection volume.
93
+ top (float): Top boundary of the projection volume.
94
+ near (float): Distance to the near clipping plane.
95
+ far (float): Distance to the far clipping plane.
96
+
97
+ Returns:
98
+ numpy.ndarray: The orthographic projection matrix.
99
+ """
100
+ ortho_matrix = np.eye(4, dtype=np.float32)
101
+ ortho_matrix[0, 0] = 2 / (right - left)
102
+ ortho_matrix[1, 1] = 2 / (top - bottom)
103
+ ortho_matrix[2, 2] = -2 / (far - near)
104
+ ortho_matrix[0, 3] = -(right + left) / (right - left)
105
+ ortho_matrix[1, 3] = -(top + bottom) / (top - bottom)
106
+ ortho_matrix[2, 3] = -(far + near) / (far - near)
107
+ return ortho_matrix
108
+
109
+
110
+ def get_perspective_projection_matrix(fovy, aspect_wh, near, far):
111
+ fovy_rad = math.radians(fovy)
112
+ return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0],
113
+ [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0],
114
+ [0, 0, -(far + near) / (far - near), -
115
+ 2.0 * far * near / (far - near)],
116
+ [0, 0, -1, 0]]).astype(np.float32)
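The helpers above compose into a standard model-view-projection chain: get_mv_matrix builds the world-to-camera transform, one of the projection helpers maps camera space to clip space, and transform_pos applies the result to homogeneous vertices. A short sketch with toy vertices:

# Sketch only: composing the camera utilities above for a batch of toy vertices.
import numpy as np
import torch
from hy3dgen.texgen.differentiable_renderer.camera_utils import (
    get_mv_matrix, get_orthographic_projection_matrix, transform_pos)

mv = get_mv_matrix(elev=20.0, azim=30.0, camera_distance=1.5)      # world -> camera
proj = get_orthographic_projection_matrix(near=0.1, far=100.0)     # camera -> clip
mvp = np.matmul(proj, mv).astype(np.float32)

verts = torch.rand(8, 3) - 0.5                                     # toy vertex positions
clip_pos = transform_pos(mvp, verts)                               # shape [1, 8, 4]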
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_processor.py ADDED
@@ -0,0 +1,70 @@
1
+ import numpy as np
2
+
3
+ def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx):
4
+ texture_height, texture_width, texture_channel = texture.shape
5
+ vtx_num = vtx_pos.shape[0]
6
+
7
+ vtx_mask = np.zeros(vtx_num, dtype=np.float32)
8
+ vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)]
9
+ uncolored_vtxs = []
10
+ G = [[] for _ in range(vtx_num)]
11
+
12
+ for i in range(uv_idx.shape[0]):
13
+ for k in range(3):
14
+ vtx_uv_idx = uv_idx[i, k]
15
+ vtx_idx = pos_idx[i, k]
16
+ uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
17
+ uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
18
+ if mask[uv_u, uv_v] > 0:
19
+ vtx_mask[vtx_idx] = 1.0
20
+ vtx_color[vtx_idx] = texture[uv_u, uv_v]
21
+ else:
22
+ uncolored_vtxs.append(vtx_idx)
23
+ G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3])
24
+
25
+ smooth_count = 2
26
+ last_uncolored_vtx_count = 0
27
+ while smooth_count > 0:
28
+ uncolored_vtx_count = 0
29
+ for vtx_idx in uncolored_vtxs:
30
+ sum_color = np.zeros(texture_channel, dtype=np.float32)
31
+ total_weight = 0.0
32
+ vtx_0 = vtx_pos[vtx_idx]
33
+ for connected_idx in G[vtx_idx]:
34
+ if vtx_mask[connected_idx] > 0:
35
+ vtx1 = vtx_pos[connected_idx]
36
+ dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2))
37
+ dist_weight = 1.0 / max(dist, 1e-4)
38
+ dist_weight *= dist_weight
39
+ sum_color += vtx_color[connected_idx] * dist_weight
40
+ total_weight += dist_weight
41
+ if total_weight > 0:
42
+ vtx_color[vtx_idx] = sum_color / total_weight
43
+ vtx_mask[vtx_idx] = 1.0
44
+ else:
45
+ uncolored_vtx_count += 1
46
+
47
+ if last_uncolored_vtx_count == uncolored_vtx_count:
48
+ smooth_count -= 1
49
+ else:
50
+ smooth_count += 1
51
+ last_uncolored_vtx_count = uncolored_vtx_count
52
+
53
+ new_texture = texture.copy()
54
+ new_mask = mask.copy()
55
+ for face_idx in range(uv_idx.shape[0]):
56
+ for k in range(3):
57
+ vtx_uv_idx = uv_idx[face_idx, k]
58
+ vtx_idx = pos_idx[face_idx, k]
59
+ if vtx_mask[vtx_idx] == 1.0:
60
+ uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
61
+ uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
62
+ new_texture[uv_u, uv_v] = vtx_color[vtx_idx]
63
+ new_mask[uv_u, uv_v] = 255
64
+ return new_texture, new_mask
65
+
66
+ def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"):
67
+ if method == "smooth":
68
+ return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
69
+ else:
70
+ raise ValueError("Invalid method. Use 'smooth' or 'forward'.")
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_render.py ADDED
@@ -0,0 +1,833 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import cv2
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn.functional as F
29
+ import trimesh
30
+ from PIL import Image
31
+
32
+ from .camera_utils import (
33
+ transform_pos,
34
+ get_mv_matrix,
35
+ get_orthographic_projection_matrix,
36
+ get_perspective_projection_matrix,
37
+ )
38
+ from .mesh_processor import meshVerticeInpaint
39
+ from .mesh_utils import load_mesh, save_mesh
40
+
41
+
42
+ def stride_from_shape(shape):
43
+ stride = [1]
44
+ for x in reversed(shape[1:]):
45
+ stride.append(stride[-1] * x)
46
+ return list(reversed(stride))
47
+
48
+
49
+ def scatter_add_nd_with_count(input, count, indices, values, weights=None):
50
+ # input: [..., C], D dimension + C channel
51
+ # count: [..., 1], D dimension
52
+ # indices: [N, D], long
53
+ # values: [N, C]
54
+
55
+ D = indices.shape[-1]
56
+ C = input.shape[-1]
57
+ size = input.shape[:-1]
58
+ stride = stride_from_shape(size)
59
+
60
+ assert len(size) == D
61
+
62
+ input = input.view(-1, C) # [HW, C]
63
+ count = count.view(-1, 1)
64
+
65
+ flatten_indices = (indices * torch.tensor(stride,
66
+ dtype=torch.long, device=indices.device)).sum(-1) # [N]
67
+
68
+ if weights is None:
69
+ weights = torch.ones_like(values[..., :1])
70
+
71
+ input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values)
72
+ count.scatter_add_(0, flatten_indices.unsqueeze(1), weights)
73
+
74
+ return input.view(*size, C), count.view(*size, 1)
75
+
76
+
77
+ def linear_grid_put_2d(H, W, coords, values, return_count=False):
78
+ # coords: [N, 2], float in [0, 1]
79
+ # values: [N, C]
80
+
81
+ C = values.shape[-1]
82
+
83
+ indices = coords * torch.tensor(
84
+ [H - 1, W - 1], dtype=torch.float32, device=coords.device
85
+ )
86
+ indices_00 = indices.floor().long() # [N, 2]
87
+ indices_00[:, 0].clamp_(0, H - 2)
88
+ indices_00[:, 1].clamp_(0, W - 2)
89
+ indices_01 = indices_00 + torch.tensor(
90
+ [0, 1], dtype=torch.long, device=indices.device
91
+ )
92
+ indices_10 = indices_00 + torch.tensor(
93
+ [1, 0], dtype=torch.long, device=indices.device
94
+ )
95
+ indices_11 = indices_00 + torch.tensor(
96
+ [1, 1], dtype=torch.long, device=indices.device
97
+ )
98
+
99
+ h = indices[..., 0] - indices_00[..., 0].float()
100
+ w = indices[..., 1] - indices_00[..., 1].float()
101
+ w_00 = (1 - h) * (1 - w)
102
+ w_01 = (1 - h) * w
103
+ w_10 = h * (1 - w)
104
+ w_11 = h * w
105
+
106
+ result = torch.zeros(H, W, C, device=values.device,
107
+ dtype=values.dtype) # [H, W, C]
108
+ count = torch.zeros(H, W, 1, device=values.device,
109
+ dtype=values.dtype) # [H, W, 1]
110
+ weights = torch.ones_like(values[..., :1]) # [N, 1]
111
+
112
+ result, count = scatter_add_nd_with_count(
113
+ result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1))
114
+ result, count = scatter_add_nd_with_count(
115
+ result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1))
116
+ result, count = scatter_add_nd_with_count(
117
+ result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1))
118
+ result, count = scatter_add_nd_with_count(
119
+ result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1))
120
+
121
+ if return_count:
122
+ return result, count
123
+
124
+ mask = (count.squeeze(-1) > 0)
125
+ result[mask] = result[mask] / count[mask].repeat(1, C)
126
+
127
+ return result
128
+
129
+
130
+ class MeshRender():
131
+ def __init__(
132
+ self,
133
+ camera_distance=1.45, camera_type='orth',
134
+ default_resolution=1024, texture_size=1024,
135
+ use_antialias=True, max_mip_level=None, filter_mode='linear',
136
+ bake_mode='linear', raster_mode='cr', device='cuda'):
137
+
138
+ self.device = device
139
+
140
+ self.set_default_render_resolution(default_resolution)
141
+ self.set_default_texture_resolution(texture_size)
142
+
143
+ self.camera_distance = camera_distance
144
+ self.use_antialias = use_antialias
145
+ self.max_mip_level = max_mip_level
146
+ self.filter_mode = filter_mode
147
+
148
+ self.bake_angle_thres = 75
149
+ self.bake_unreliable_kernel_size = int(
150
+ (2 / 512) * max(self.default_resolution[0], self.default_resolution[1]))
151
+ self.bake_mode = bake_mode
152
+
153
+ self.raster_mode = raster_mode
154
+ if self.raster_mode == 'cr':
155
+ import custom_rasterizer as cr
156
+ self.raster = cr
157
+ else:
158
+ raise ValueError(f'No raster named {self.raster_mode}')
159
+
160
+ if camera_type == 'orth':
161
+ self.ortho_scale = 1.2
162
+ self.camera_proj_mat = get_orthographic_projection_matrix(
163
+ left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5,
164
+ bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5,
165
+ near=0.1, far=100
166
+ )
167
+ elif camera_type == 'perspective':
168
+ self.camera_proj_mat = get_perspective_projection_matrix(
169
+ 49.13, self.default_resolution[1] / self.default_resolution[0],
170
+ 0.01, 100.0
171
+ )
172
+ else:
173
+ raise ValueError(f'No camera type {camera_type}')
174
+
175
+ def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):
176
+
177
+ if self.raster_mode == 'cr':
178
+ rast_out_db = None
179
+ if pos.dim() == 2:
180
+ pos = pos.unsqueeze(0)
181
+ findices, barycentric = self.raster.rasterize(pos, tri, resolution)
182
+ rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1)
183
+ rast_out = rast_out.unsqueeze(0)
184
+ else:
185
+ raise ValueError(f'No raster named {self.raster_mode}')
186
+
187
+ return rast_out, rast_out_db
188
+
189
+ def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None):
190
+
191
+ if self.raster_mode == 'cr':
192
+ textd = None
193
+ barycentric = rast_out[0, ..., :-1]
194
+ findices = rast_out[0, ..., -1]
195
+ if uv.dim() == 2:
196
+ uv = uv.unsqueeze(0)
197
+ textc = self.raster.interpolate(uv, findices, barycentric, uv_idx)
198
+ else:
199
+ raise ValueError(f'No raster named {self.raster_mode}')
200
+
201
+ return textc, textd
202
+
203
+ def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto',
204
+ boundary_mode='wrap', max_mip_level=None):
205
+
206
+ if self.raster_mode == 'cr':
207
+ raise NotImplementedError('Texture is not implemented in cr')
208
+ else:
209
+ raise ValueError(f'No raster named {self.raster_mode}')
210
+
211
+ return color
212
+
213
+ def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
214
+
215
+ if self.raster_mode == 'cr':
216
+ # Antialias has not been supported yet
217
+ color = color
218
+ else:
219
+ raise ValueError(f'No raster named {self.raster_mode}')
220
+
221
+ return color
222
+
223
+ def load_mesh(
224
+ self,
225
+ mesh,
226
+ scale_factor=1.15,
227
+ auto_center=True,
228
+ ):
229
+ vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh)
230
+ self.mesh_copy = mesh
231
+ self.set_mesh(vtx_pos, pos_idx,
232
+ vtx_uv=vtx_uv, uv_idx=uv_idx,
233
+ scale_factor=scale_factor, auto_center=auto_center
234
+ )
235
+ if texture_data is not None:
236
+ self.set_texture(texture_data)
237
+
238
+ def save_mesh(self):
239
+ texture_data = self.get_texture()
240
+ texture_data = Image.fromarray((texture_data * 255).astype(np.uint8))
241
+ return save_mesh(self.mesh_copy, texture_data)
242
+
243
+ def set_mesh(
244
+ self,
245
+ vtx_pos, pos_idx,
246
+ vtx_uv=None, uv_idx=None,
247
+ scale_factor=1.15, auto_center=True
248
+ ):
249
+
250
+ self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float()
251
+ self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int)
252
+ if (vtx_uv is not None) and (uv_idx is not None):
253
+ self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float()
254
+ self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int)
255
+ else:
256
+ self.vtx_uv = None
257
+ self.uv_idx = None
258
+
259
+ self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]]
260
+ self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]]
261
+ if (vtx_uv is not None) and (uv_idx is not None):
262
+ self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1]
263
+
264
+ if auto_center:
265
+ max_bb = (self.vtx_pos - 0).max(0)[0]
266
+ min_bb = (self.vtx_pos - 0).min(0)[0]
267
+ center = (max_bb + min_bb) / 2
268
+ scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0
269
+ self.vtx_pos = (self.vtx_pos - center) * \
270
+ (scale_factor / float(scale))
271
+ self.scale_factor = scale_factor
272
+
273
+ def set_texture(self, tex):
274
+ if isinstance(tex, np.ndarray):
275
+ tex = Image.fromarray((tex * 255).astype(np.uint8))
276
+ elif isinstance(tex, torch.Tensor):
277
+ tex = tex.cpu().numpy()
278
+ tex = Image.fromarray((tex * 255).astype(np.uint8))
279
+
280
+ tex = tex.resize(self.texture_size).convert('RGB')
281
+ tex = np.array(tex) / 255.0
282
+ self.tex = torch.from_numpy(tex).to(self.device)
283
+ self.tex = self.tex.float()
284
+
285
+ def set_default_render_resolution(self, default_resolution):
286
+ if isinstance(default_resolution, int):
287
+ default_resolution = (default_resolution, default_resolution)
288
+ self.default_resolution = default_resolution
289
+
290
+ def set_default_texture_resolution(self, texture_size):
291
+ if isinstance(texture_size, int):
292
+ texture_size = (texture_size, texture_size)
293
+ self.texture_size = texture_size
294
+
295
+ def get_mesh(self):
296
+ vtx_pos = self.vtx_pos.cpu().numpy()
297
+ pos_idx = self.pos_idx.cpu().numpy()
298
+ vtx_uv = self.vtx_uv.cpu().numpy()
299
+ uv_idx = self.uv_idx.cpu().numpy()
300
+
301
+ # Undo the coordinate transform applied in set_mesh
302
+ vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]]
303
+ vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]]
304
+
305
+ vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1]
306
+ return vtx_pos, pos_idx, vtx_uv, uv_idx
307
+
308
+ def get_texture(self):
309
+ return self.tex.cpu().numpy()
310
+
311
+ def to(self, device):
312
+ self.device = device
313
+
314
+ for attr_name in dir(self):
315
+ attr_value = getattr(self, attr_name)
316
+ if isinstance(attr_value, torch.Tensor):
317
+ setattr(self, attr_name, attr_value.to(self.device))
318
+
319
+ def color_rgb_to_srgb(self, image):
320
+ if isinstance(image, Image.Image):
321
+ image_rgb = torch.tensor(np.array(image) / 255.0).float().to(self.device)
325
+ elif isinstance(image, np.ndarray):
326
+ image_rgb = torch.tensor(image).float()
327
+ else:
328
+ image_rgb = image.to(self.device)
329
+
330
+ image_srgb = torch.where(
331
+ image_rgb <= 0.0031308,
332
+ 12.92 * image_rgb,
333
+ 1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055
334
+ )
335
+
336
+ if isinstance(image, Image.Image):
337
+ image_srgb = Image.fromarray(
338
+ (image_srgb.cpu().numpy() *
339
+ 255).astype(
340
+ np.uint8))
341
+ elif isinstance(image, np.ndarray):
342
+ image_srgb = image_srgb.cpu().numpy()
343
+ else:
344
+ image_srgb = image_srgb.to(image.device)
345
+
346
+ return image_srgb
347
+
348
+ def _render(
349
+ self,
351
+ mvp,
352
+ pos,
353
+ pos_idx,
354
+ uv,
355
+ uv_idx,
356
+ tex,
357
+ resolution,
358
+ max_mip_level,
359
+ keep_alpha,
360
+ filter_mode
361
+ ):
362
+ pos_clip = transform_pos(mvp, pos)
363
+ if isinstance(resolution, (int, float)):
364
+ resolution = [resolution, resolution]
365
+ rast_out, rast_out_db = self.raster_rasterize(
366
+ pos_clip, pos_idx, resolution=resolution)
367
+
368
+ tex = tex.contiguous()
369
+ if filter_mode == 'linear-mipmap-linear':
370
+ texc, texd = self.raster_interpolate(
371
+ uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
372
+ color = self.raster_texture(
373
+ tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
374
+ else:
375
+ texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx)
376
+ color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode)
377
+
378
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
379
+ color = color * visible_mask # Mask out background.
380
+ if self.use_antialias:
381
+ color = self.raster_antialias(color, rast_out, pos_clip, pos_idx)
382
+
383
+ if keep_alpha:
384
+ color = torch.cat([color, visible_mask], dim=-1)
385
+ return color[0, ...]
386
+
387
+ def render(
388
+ self,
389
+ elev,
390
+ azim,
391
+ camera_distance=None,
392
+ center=None,
393
+ resolution=None,
394
+ tex=None,
395
+ keep_alpha=True,
396
+ bgcolor=None,
397
+ filter_mode=None,
398
+ return_type='th'
399
+ ):
400
+
401
+ proj = self.camera_proj_mat
402
+ r_mv = get_mv_matrix(
403
+ elev=elev,
404
+ azim=azim,
405
+ camera_distance=self.camera_distance if camera_distance is None else camera_distance,
406
+ center=center)
407
+ r_mvp = np.matmul(proj, r_mv).astype(np.float32)
408
+ if tex is not None:
409
+ if isinstance(tex, Image.Image):
410
+ tex = torch.tensor(np.array(tex) / 255.0)
411
+ elif isinstance(tex, np.ndarray):
412
+ tex = torch.tensor(tex)
413
+ if tex.dim() == 2:
414
+ tex = tex.unsqueeze(-1)
415
+ tex = tex.float().to(self.device)
416
+ image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx,
417
+ self.tex if tex is None else tex,
418
+ self.default_resolution if resolution is None else resolution,
419
+ self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode)
420
+ mask = (image[..., [-1]] == 1).float()
421
+ if bgcolor is None:
422
+ bgcolor = [0 for _ in range(image.shape[-1] - 1)]
423
+ image = image * mask + (1 - mask) * \
424
+ torch.tensor(bgcolor + [0]).to(self.device)
425
+ if not keep_alpha:
426
+ image = image[..., :-1]
427
+ if return_type == 'np':
428
+ image = image.cpu().numpy()
429
+ elif return_type == 'pl':
430
+ image = image.squeeze(-1).cpu().numpy() * 255
431
+ image = Image.fromarray(image.astype(np.uint8))
432
+ return image
433
+
434
+ def render_normal(
435
+ self,
436
+ elev,
437
+ azim,
438
+ camera_distance=None,
439
+ center=None,
440
+ resolution=None,
441
+ bg_color=[1, 1, 1],
442
+ use_abs_coor=False,
443
+ normalize_rgb=True,
444
+ return_type='th'
445
+ ):
446
+
447
+ pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
448
+ if resolution is None:
449
+ resolution = self.default_resolution
450
+ if isinstance(resolution, (int, float)):
451
+ resolution = [resolution, resolution]
452
+ rast_out, rast_out_db = self.raster_rasterize(
453
+ pos_clip, self.pos_idx, resolution=resolution)
454
+
455
+ if use_abs_coor:
456
+ mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :]
457
+ else:
458
+ pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
459
+ mesh_triangles = pos_camera[self.pos_idx[:, :3], :]
460
+ face_normals = F.normalize(
461
+ torch.cross(mesh_triangles[:, 1, :] - mesh_triangles[:, 0, :],
+ mesh_triangles[:, 2, :] - mesh_triangles[:, 0, :],
+ dim=-1),
+ dim=-1)
473
+
474
+ vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
475
+ faces=self.pos_idx.cpu(),
476
+ face_normals=face_normals.cpu(), )
477
+ vertex_normals = torch.from_numpy(
478
+ vertex_normals).float().to(self.device).contiguous()
479
+
480
+ # Interpolate normal values across the rasterized pixels
481
+ normal, _ = self.raster_interpolate(
482
+ vertex_normals[None, ...], rast_out, self.pos_idx)
483
+
484
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
485
+ normal = normal * visible_mask + \
486
+ torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 -
487
+ visible_mask) # Mask out background.
488
+
489
+ if normalize_rgb:
490
+ normal = (normal + 1) * 0.5
491
+ if self.use_antialias:
492
+ normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx)
493
+
494
+ image = normal[0, ...]
495
+ if return_type == 'np':
496
+ image = image.cpu().numpy()
497
+ elif return_type == 'pl':
498
+ image = image.cpu().numpy() * 255
499
+ image = Image.fromarray(image.astype(np.uint8))
500
+
501
+ return image
502
+
503
+ def convert_normal_map(self, image):
504
+ # blue is front, red is left, green is top
505
+ if isinstance(image, Image.Image):
506
+ image = np.array(image)
507
+ mask = (image == [255, 255, 255]).all(axis=-1)
508
+
509
+ image = (image / 255.0) * 2.0 - 1.0
510
+
511
+ image[..., [1]] = -image[..., [1]]
512
+ image[..., [1, 2]] = image[..., [2, 1]]
513
+ image[..., [0]] = -image[..., [0]]
514
+
515
+ image = (image + 1.0) * 0.5
516
+
517
+ image = (image * 255).astype(np.uint8)
518
+ image[mask] = [127, 127, 255]
519
+
520
+ return Image.fromarray(image)
521
+
522
+ def get_pos_from_mvp(self, elev, azim, camera_distance, center):
523
+ proj = self.camera_proj_mat
524
+ r_mv = get_mv_matrix(
525
+ elev=elev,
526
+ azim=azim,
527
+ camera_distance=self.camera_distance if camera_distance is None else camera_distance,
528
+ center=center)
529
+
530
+ pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
531
+ pos_clip = transform_pos(proj, pos_camera)
532
+
533
+ return pos_camera, pos_clip
534
+
535
+ def render_depth(
536
+ self,
537
+ elev,
538
+ azim,
539
+ camera_distance=None,
540
+ center=None,
541
+ resolution=None,
542
+ return_type='th'
543
+ ):
544
+ pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
545
+
546
+ if resolution is None:
547
+ resolution = self.default_resolution
548
+ if isinstance(resolution, (int, float)):
549
+ resolution = [resolution, resolution]
550
+ rast_out, rast_out_db = self.raster_rasterize(
551
+ pos_clip, self.pos_idx, resolution=resolution)
552
+
553
+ pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
554
+ tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()
555
+
556
+ # Interpolate depth values across the rasterized pixels
557
+ depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)
558
+
559
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
560
+ depth_max, depth_min = depth[visible_mask >
561
+ 0].max(), depth[visible_mask > 0].min()
562
+ depth = (depth - depth_min) / (depth_max - depth_min)
563
+
564
+ depth = depth * visible_mask # Mask out background.
565
+ if self.use_antialias:
566
+ depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx)
567
+
568
+ image = depth[0, ...]
569
+ if return_type == 'np':
570
+ image = image.cpu().numpy()
571
+ elif return_type == 'pl':
572
+ image = image.squeeze(-1).cpu().numpy() * 255
573
+ image = Image.fromarray(image.astype(np.uint8))
574
+ return image
575
+
576
+ def render_position(self, elev, azim, camera_distance=None, center=None,
577
+ resolution=None, bg_color=[1, 1, 1], return_type='th'):
578
+ pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
579
+ if resolution is None:
580
+ resolution = self.default_resolution
581
+ if isinstance(resolution, (int, float)):
582
+ resolution = [resolution, resolution]
583
+ rast_out, rast_out_db = self.raster_rasterize(
584
+ pos_clip, self.pos_idx, resolution=resolution)
585
+
586
+ tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor
587
+ tex_position = tex_position.contiguous()
588
+
589
+ # Interpolate depth values across the rasterized pixels
590
+ position, _ = self.raster_interpolate(
591
+ tex_position[None, ...], rast_out, self.pos_idx)
592
+
593
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
594
+
595
+ position = position * visible_mask + \
596
+ torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 -
597
+ visible_mask) # Mask out background.
598
+ if self.use_antialias:
599
+ position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx)
600
+
601
+ image = position[0, ...]
602
+
603
+ if return_type == 'np':
604
+ image = image.cpu().numpy()
605
+ elif return_type == 'pl':
606
+ image = image.squeeze(-1).cpu().numpy() * 255
607
+ image = Image.fromarray(image.astype(np.uint8))
608
+ return image
609
+
610
+ def render_uvpos(self, return_type='th'):
611
+ image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5)
612
+ if return_type == 'np':
613
+ image = image.cpu().numpy()
614
+ elif return_type == 'pl':
615
+ image = image.cpu().numpy() * 255
616
+ image = Image.fromarray(image.astype(np.uint8))
617
+ return image
618
+
619
+ def uv_feature_map(self, vert_feat, bg=None):
620
+ vtx_uv = self.vtx_uv * 2 - 1.0
621
+ vtx_uv = torch.cat(
622
+ [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0)
623
+ vtx_uv[..., -1] = 1
624
+ uv_idx = self.uv_idx
625
+ rast_out, rast_out_db = self.raster_rasterize(
626
+ vtx_uv, uv_idx, resolution=self.texture_size)
627
+ feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx)
628
+ feat_map = feat_map[0, ...]
629
+ if bg is not None:
630
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]
631
+ feat_map[visible_mask == 0] = bg
632
+ return feat_map
633
+
634
+ def render_sketch_from_geometry(self, normal_image, depth_image):
635
+ normal_image_np = normal_image.cpu().numpy()
636
+ depth_image_np = depth_image.cpu().numpy()
637
+
638
+ normal_image_np = (normal_image_np * 255).astype(np.uint8)
639
+ depth_image_np = (depth_image_np * 255).astype(np.uint8)
640
+ normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY)
641
+
642
+ normal_edges = cv2.Canny(normal_image_np, 80, 150)
643
+ depth_edges = cv2.Canny(depth_image_np, 30, 80)
644
+
645
+ combined_edges = np.maximum(normal_edges, depth_edges)
646
+
647
+ sketch_image = torch.from_numpy(combined_edges).to(
648
+ normal_image.device).float() / 255.0
649
+ sketch_image = sketch_image.unsqueeze(-1)
650
+
651
+ return sketch_image
652
+
653
+ def render_sketch_from_depth(self, depth_image):
654
+ depth_image_np = depth_image.cpu().numpy()
655
+ depth_image_np = (depth_image_np * 255).astype(np.uint8)
656
+ depth_edges = cv2.Canny(depth_image_np, 30, 80)
657
+ combined_edges = depth_edges
658
+ sketch_image = torch.from_numpy(combined_edges).to(
659
+ depth_image.device).float() / 255.0
660
+ sketch_image = sketch_image.unsqueeze(-1)
661
+ return sketch_image
662
+
663
+ def back_project(self, image, elev, azim,
664
+ camera_distance=None, center=None, method=None):
665
+ if isinstance(image, Image.Image):
666
+ image = torch.tensor(np.array(image) / 255.0)
667
+ elif isinstance(image, np.ndarray):
668
+ image = torch.tensor(image)
669
+ if image.dim() == 2:
670
+ image = image.unsqueeze(-1)
671
+ image = image.float().to(self.device)
672
+ resolution = image.shape[:2]
673
+ channel = image.shape[-1]
674
+ texture = torch.zeros(self.texture_size + (channel,)).to(self.device)
675
+ cos_map = torch.zeros(self.texture_size + (1,)).to(self.device)
676
+
677
+ proj = self.camera_proj_mat
678
+ r_mv = get_mv_matrix(
679
+ elev=elev,
680
+ azim=azim,
681
+ camera_distance=self.camera_distance if camera_distance is None else camera_distance,
682
+ center=center)
683
+ pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
684
+ pos_clip = transform_pos(proj, pos_camera)
685
+ pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
686
+ v0 = pos_camera[self.pos_idx[:, 0], :]
687
+ v1 = pos_camera[self.pos_idx[:, 1], :]
688
+ v2 = pos_camera[self.pos_idx[:, 2], :]
689
+ face_normals = F.normalize(
690
+ torch.cross(
691
+ v1 - v0,
692
+ v2 - v0,
693
+ dim=-1),
694
+ dim=-1)
695
+ vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
696
+ faces=self.pos_idx.cpu(),
697
+ face_normals=face_normals.cpu(), )
698
+ vertex_normals = torch.from_numpy(
699
+ vertex_normals).float().to(self.device).contiguous()
700
+ tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()
701
+ rast_out, rast_out_db = self.raster_rasterize(
702
+ pos_clip, self.pos_idx, resolution=resolution)
703
+ visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]
704
+
705
+ normal, _ = self.raster_interpolate(
706
+ vertex_normals[None, ...], rast_out, self.pos_idx)
707
+ normal = normal[0, ...]
708
+ uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx)
709
+ depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)
710
+ depth = depth[0, ...]
711
+
712
+ depth_max, depth_min = depth[visible_mask >
713
+ 0].max(), depth[visible_mask > 0].min()
714
+ depth_normalized = (depth - depth_min) / (depth_max - depth_min)
715
+ depth_image = depth_normalized * visible_mask # Mask out background.
716
+
717
+ sketch_image = self.render_sketch_from_depth(depth_image)
718
+
719
+ lookat = torch.tensor([[0, 0, -1]], device=self.device)
720
+ cos_image = torch.nn.functional.cosine_similarity(
721
+ lookat, normal.view(-1, 3))
722
+ cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1)
723
+
724
+ cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi)
725
+ cos_image[cos_image < cos_thres] = 0
726
+
727
+ # shrink
728
+ kernel_size = self.bake_unreliable_kernel_size * 2 + 1
729
+ kernel = torch.ones(
730
+ (1, 1, kernel_size, kernel_size), dtype=torch.float32).to(
731
+ sketch_image.device)
732
+
733
+ visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float()
734
+ visible_mask = F.conv2d(
735
+ 1.0 - visible_mask,
736
+ kernel,
737
+ padding=kernel_size // 2)
738
+ visible_mask = 1.0 - (visible_mask > 0).float()  # binarize
739
+ visible_mask = visible_mask.squeeze(0).permute(1, 2, 0)
740
+
741
+ sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0)
742
+ sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2)
743
+ sketch_image = (sketch_image > 0).float()  # binarize
744
+ sketch_image = sketch_image.squeeze(0).permute(1, 2, 0)
745
+ visible_mask = visible_mask * (sketch_image < 0.5)
746
+
747
+ cos_image[visible_mask == 0] = 0
748
+
749
+ method = self.bake_mode if method is None else method
750
+
751
+ if method == 'linear':
752
+ proj_mask = (visible_mask != 0).view(-1)
753
+ uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask]
754
+ image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask]
755
+ cos_image = cos_image.contiguous().view(-1, 1)[proj_mask]
756
+ sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask]
757
+
758
+ texture = linear_grid_put_2d(
759
+ self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image)
760
+ cos_map = linear_grid_put_2d(
761
+ self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image)
762
+ boundary_map = linear_grid_put_2d(
763
+ self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image)
764
+ else:
765
+ raise ValueError(f'No bake mode {method}')
766
+
767
+ return texture, cos_map, boundary_map
768
+
769
+ def bake_texture(self, colors, elevs, azims,
770
+ camera_distance=None, center=None, exp=6, weights=None):
771
+ for i in range(len(colors)):
772
+ if isinstance(colors[i], Image.Image):
773
+ colors[i] = torch.tensor(
774
+ np.array(
775
+ colors[i]) / 255.0,
776
+ device=self.device).float()
777
+ if weights is None:
778
+ weights = [1.0 for _ in range(len(colors))]
779
+ textures = []
780
+ cos_maps = []
781
+ for color, elev, azim, weight in zip(colors, elevs, azims, weights):
782
+ texture, cos_map, _ = self.back_project(
783
+ color, elev, azim, camera_distance, center)
784
+ cos_map = weight * (cos_map ** exp)
785
+ textures.append(texture)
786
+ cos_maps.append(cos_map)
787
+
788
+ texture_merge, trust_map_merge = self.fast_bake_texture(
789
+ textures, cos_maps)
790
+ return texture_merge, trust_map_merge
791
+
792
+ @torch.no_grad()
793
+ def fast_bake_texture(self, textures, cos_maps):
794
+
795
+ channel = textures[0].shape[-1]
796
+ texture_merge = torch.zeros(
797
+ self.texture_size + (channel,)).to(self.device)
798
+ trust_map_merge = torch.zeros(self.texture_size + (1,)).to(self.device)
799
+ for texture, cos_map in zip(textures, cos_maps):
800
+ view_sum = (cos_map > 0).sum()
801
+ painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum()
802
+ if painted_sum / view_sum > 0.99:
803
+ continue
804
+ texture_merge += texture * cos_map
805
+ trust_map_merge += cos_map
806
+ texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8)
807
+
808
+ return texture_merge, trust_map_merge > 1E-8
809
+
810
+ def uv_inpaint(self, texture, mask):
811
+
812
+ if isinstance(texture, torch.Tensor):
813
+ texture_np = texture.cpu().numpy()
814
+ elif isinstance(texture, np.ndarray):
815
+ texture_np = texture
816
+ elif isinstance(texture, Image.Image):
817
+ texture_np = np.array(texture) / 255.0
818
+
819
+ vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh()
820
+
821
+ texture_np, mask = meshVerticeInpaint(
822
+ texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
823
+
824
+ texture_np = cv2.inpaint(
825
+ (texture_np * 255).astype(np.uint8),
+ 255 - mask,
+ 3,
+ cv2.INPAINT_NS)
832
+
833
+ return texture_np
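
Roughly, MeshRender is driven as sketched below. This is only an illustration of the call order, not the full texture pipeline: it assumes the custom_rasterizer extension is built, a CUDA device is available, and 'mesh.glb' stands in for a user-supplied UV-unwrapped mesh; in the real pipeline the views fed to bake_texture come from the multiview diffusion model, the rendered normal map is used here only as a placeholder.

import numpy as np
import trimesh

from hy3dgen.texgen.differentiable_renderer.mesh_render import MeshRender

renderer = MeshRender(camera_distance=1.45, camera_type='orth',
                      default_resolution=1024, texture_size=1024, device='cuda')
renderer.load_mesh(trimesh.load('mesh.glb', force='mesh'))  # placeholder path

# Geometry-only conditioning views.
normal_view = renderer.render_normal(elev=0, azim=0, return_type='pl')
position_view = renderer.render_position(elev=0, azim=0, return_type='pl')

# Back-project views into UV space and blend them (normally the generated RGB views).
texture, trust_mask = renderer.bake_texture(colors=[normal_view], elevs=[0], azims=[0])
mask_np = (trust_mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)
texture_np = renderer.uv_inpaint(texture, mask_np)            # uint8 H x W x 3

renderer.set_texture(texture_np.astype(np.float32) / 255.0)   # set_texture expects [0, 1] floats
textured_mesh = renderer.save_mesh()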
build/lib/hy3dgen/texgen/differentiable_renderer/mesh_utils.py ADDED
@@ -0,0 +1,44 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import trimesh
26
+
27
+
28
+ def load_mesh(mesh):
29
+ vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None
30
+ pos_idx = mesh.faces if hasattr(mesh, 'faces') else None
31
+
32
+ vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None
33
+ uv_idx = mesh.faces if hasattr(mesh, 'faces') else None
34
+
35
+ texture_data = None
36
+
37
+ return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data
38
+
39
+
40
+ def save_mesh(mesh, texture_data):
41
+ material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255))
42
+ texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material)
43
+ mesh.visual = texture_visuals
44
+ return mesh
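
These helpers only unpack a trimesh mesh into arrays and re-attach a baked texture; a short round-trip sketch (again with 'mesh.glb' as a placeholder for a UV-unwrapped mesh):

import numpy as np
import trimesh
from PIL import Image

from hy3dgen.texgen.differentiable_renderer.mesh_utils import load_mesh, save_mesh

mesh = trimesh.load('mesh.glb', force='mesh')        # placeholder path
vtx_pos, pos_idx, vtx_uv, uv_idx, _ = load_mesh(mesh)
print(vtx_pos.shape, pos_idx.shape)                  # (V, 3) (F, 3)

# Attach a flat gray texture and get back a textured trimesh.Trimesh.
gray = Image.fromarray(np.full((64, 64, 3), 200, dtype=np.uint8))
textured = save_mesh(mesh, gray)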
build/lib/hy3dgen/texgen/differentiable_renderer/setup.py ADDED
@@ -0,0 +1,48 @@
1
+ from setuptools import setup, Extension
2
+ import pybind11
3
+ import sys
4
+ import platform
5
+
6
+ def get_platform_specific_args():
7
+ system = platform.system().lower()
8
+ cpp_std = 'c++14' # Make configurable if needed
9
+
10
+ if sys.platform == 'win32':
11
+ compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj']
12
+ link_args = []
13
+ extra_includes = []
14
+ elif system == 'linux':
15
+ compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread']
16
+ link_args = ['-fPIC', '-pthread']
17
+ extra_includes = []
18
+ elif sys.platform == 'darwin':
19
+ compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra',
20
+ '-stdlib=libc++', '-mmacosx-version-min=10.14']
21
+ link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib']
22
+ extra_includes = []
23
+ else:
24
+ raise RuntimeError(f"Unsupported platform: {system}")
25
+
26
+ return compile_args, link_args, extra_includes
27
+
28
+ extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args()
29
+ include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)]
30
+ include_dirs.extend(platform_includes)
31
+
32
+ ext_modules = [
33
+ Extension(
34
+ "mesh_processor",
35
+ ["mesh_processor.cpp"],
36
+ include_dirs=include_dirs,
37
+ language='c++',
38
+ extra_compile_args=extra_compile_args,
39
+ extra_link_args=extra_link_args,
40
+ ),
41
+ ]
42
+
43
+ setup(
44
+ name="mesh_processor",
45
+ ext_modules=ext_modules,
46
+ install_requires=['pybind11>=2.6.0'],
47
+ python_requires='>=3.6',
48
+ )
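
This setup script compiles the pybind11 extension from a mesh_processor.cpp expected to live next to it; the usual way to build it in place is the standard setuptools command `python setup.py build_ext --inplace`, after which the compiled module is importable under the name given to Extension() above:

# after building in place:
import mesh_processor  # noqa: F401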
build/lib/hy3dgen/texgen/hunyuanpaint/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/hunyuanpaint/pipeline.py ADDED
@@ -0,0 +1,554 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ from typing import Any, Callable, Dict, List, Optional, Union
26
+
27
+ import numpy
28
+ import numpy as np
29
+ import torch
30
+ import torch.distributed
31
+ import torch.utils.checkpoint
32
+ from PIL import Image
33
+ from diffusers import (
34
+ AutoencoderKL,
35
+ DiffusionPipeline,
36
+ ImagePipelineOutput
37
+ )
38
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
39
+ from diffusers.image_processor import PipelineImageInput
40
+ from diffusers.image_processor import VaeImageProcessor
41
+ from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
42
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \
43
+ rescale_noise_cfg
44
+ from diffusers.schedulers import KarrasDiffusionSchedulers
45
+ from diffusers.utils import deprecate
46
+ from einops import rearrange
47
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
48
+
49
+ from .unet.modules import UNet2p5DConditionModel
50
+
51
+
52
+ def to_rgb_image(maybe_rgba: Image.Image):
53
+ if maybe_rgba.mode == 'RGB':
54
+ return maybe_rgba
55
+ elif maybe_rgba.mode == 'RGBA':
56
+ rgba = maybe_rgba
57
+ img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8)
58
+ img = Image.fromarray(img, 'RGB')
59
+ img.paste(rgba, mask=rgba.getchannel('A'))
60
+ return img
61
+ else:
62
+ raise ValueError("Unsupported image type.", maybe_rgba.mode)
63
+
64
+
65
+ class HunyuanPaintPipeline(StableDiffusionPipeline):
66
+
67
+ def __init__(
68
+ self,
69
+ vae: AutoencoderKL,
70
+ text_encoder: CLIPTextModel,
71
+ tokenizer: CLIPTokenizer,
72
+ unet: UNet2p5DConditionModel,
73
+ scheduler: KarrasDiffusionSchedulers,
74
+ feature_extractor: CLIPImageProcessor,
75
+ safety_checker=None,
76
+ use_torch_compile=False,
77
+ ):
78
+ DiffusionPipeline.__init__(self)
79
+
80
+ safety_checker = None
81
+ self.register_modules(
82
+ vae=torch.compile(vae) if use_torch_compile else vae,
83
+ text_encoder=text_encoder,
84
+ tokenizer=tokenizer,
85
+ unet=unet,
86
+ scheduler=scheduler,
87
+ safety_checker=safety_checker,
88
+ feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor,
89
+ )
90
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
91
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
92
+
93
+ @torch.no_grad()
94
+ def encode_images(self, images):
95
+ B = images.shape[0]
96
+ images = rearrange(images, 'b n c h w -> (b n) c h w')
97
+
98
+ dtype = next(self.vae.parameters()).dtype
99
+ images = (images - 0.5) * 2.0
100
+ posterior = self.vae.encode(images.to(dtype)).latent_dist
101
+ latents = posterior.sample() * self.vae.config.scaling_factor
102
+
103
+ latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B)
104
+ return latents
105
+
106
+ @torch.no_grad()
107
+ def __call__(
108
+ self,
109
+ image: Image.Image = None,
110
+ prompt=None,
111
+ negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast',
112
+ *args,
113
+ num_images_per_prompt: Optional[int] = 1,
114
+ guidance_scale=2.0,
115
+ output_type: Optional[str] = "pil",
116
+ width=512,
117
+ height=512,
118
+ num_inference_steps=28,
119
+ return_dict=True,
120
+ **cached_condition,
121
+ ):
122
+ if image is None:
123
+ raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.")
124
+ assert not isinstance(image, torch.Tensor)
125
+
126
+ image = to_rgb_image(image)
127
+
128
+ image_vae = torch.tensor(np.array(image) / 255.0)
129
+ image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0)
130
+ image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype)
131
+
132
+ batch_size = image_vae.shape[0]
133
+ assert batch_size == 1
134
+ assert num_images_per_prompt == 1
135
+
136
+ ref_latents = self.encode_images(image_vae)
137
+
138
+ def convert_pil_list_to_tensor(images):
139
+ bg_c = [1., 1., 1.]
140
+ images_tensor = []
141
+ for batch_imgs in images:
142
+ view_imgs = []
143
+ for pil_img in batch_imgs:
144
+ img = numpy.asarray(pil_img, dtype=numpy.float32) / 255.
145
+ if img.shape[2] > 3:
146
+ alpha = img[:, :, 3:]
147
+ img = img[:, :, :3] * alpha + bg_c * (1 - alpha)
148
+ img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda")
149
+ view_imgs.append(img)
150
+ view_imgs = torch.cat(view_imgs, dim=0)
151
+ images_tensor.append(view_imgs.unsqueeze(0))
152
+
153
+ images_tensor = torch.cat(images_tensor, dim=0)
154
+ return images_tensor
155
+
156
+ if "normal_imgs" in cached_condition:
157
+
158
+ if isinstance(cached_condition["normal_imgs"], List):
159
+ cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"])
160
+
161
+ cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"])
162
+
163
+ if "position_imgs" in cached_condition:
164
+
165
+ if isinstance(cached_condition["position_imgs"], List):
166
+ cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"])
167
+
168
+ cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"])
169
+
170
+ if 'camera_info_gen' in cached_condition:
171
+ camera_info = cached_condition['camera_info_gen'] # B,N
172
+ if isinstance(camera_info, List):
173
+ camera_info = torch.tensor(camera_info)
174
+ camera_info = camera_info.to(image_vae.device).to(torch.int64)
175
+ cached_condition['camera_info_gen'] = camera_info
176
+ if 'camera_info_ref' in cached_condition:
177
+ camera_info = cached_condition['camera_info_ref'] # B,N
178
+ if isinstance(camera_info, List):
179
+ camera_info = torch.tensor(camera_info)
180
+ camera_info = camera_info.to(image_vae.device).to(torch.int64)
181
+ cached_condition['camera_info_ref'] = camera_info
182
+
183
+ cached_condition['ref_latents'] = ref_latents
184
+
185
+ if guidance_scale > 1:
186
+ negative_ref_latents = torch.zeros_like(cached_condition['ref_latents'])
187
+ cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']])
188
+ cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents'])
189
+ if "normal_imgs" in cached_condition:
190
+ cached_condition['normal_imgs'] = torch.cat(
191
+ (cached_condition['normal_imgs'], cached_condition['normal_imgs']))
192
+
193
+ if "position_imgs" in cached_condition:
194
+ cached_condition['position_imgs'] = torch.cat(
195
+ (cached_condition['position_imgs'], cached_condition['position_imgs']))
196
+
197
+ if 'position_maps' in cached_condition:
198
+ cached_condition['position_maps'] = torch.cat(
199
+ (cached_condition['position_maps'], cached_condition['position_maps']))
200
+
201
+ if 'camera_info_gen' in cached_condition:
202
+ cached_condition['camera_info_gen'] = torch.cat(
203
+ (cached_condition['camera_info_gen'], cached_condition['camera_info_gen']))
204
+ if 'camera_info_ref' in cached_condition:
205
+ cached_condition['camera_info_ref'] = torch.cat(
206
+ (cached_condition['camera_info_ref'], cached_condition['camera_info_ref']))
207
+
208
+ prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1)
209
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
210
+
211
+ latents: torch.Tensor = self.denoise(
212
+ None,
213
+ *args,
214
+ cross_attention_kwargs=None,
215
+ guidance_scale=guidance_scale,
216
+ num_images_per_prompt=num_images_per_prompt,
217
+ prompt_embeds=prompt_embeds,
218
+ negative_prompt_embeds=negative_prompt_embeds,
219
+ num_inference_steps=num_inference_steps,
220
+ output_type='latent',
221
+ width=width,
222
+ height=height,
223
+ **cached_condition
224
+ ).images
225
+
226
+ if not output_type == "latent":
227
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
228
+ else:
229
+ image = latents
230
+
231
+ image = self.image_processor.postprocess(image, output_type=output_type)
232
+ if not return_dict:
233
+ return (image,)
234
+
235
+ return ImagePipelineOutput(images=image)
236
+
237
+ def denoise(
238
+ self,
239
+ prompt: Union[str, List[str]] = None,
240
+ height: Optional[int] = None,
241
+ width: Optional[int] = None,
242
+ num_inference_steps: int = 50,
243
+ timesteps: List[int] = None,
244
+ sigmas: List[float] = None,
245
+ guidance_scale: float = 7.5,
246
+ negative_prompt: Optional[Union[str, List[str]]] = None,
247
+ num_images_per_prompt: Optional[int] = 1,
248
+ eta: float = 0.0,
249
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
250
+ latents: Optional[torch.Tensor] = None,
251
+ prompt_embeds: Optional[torch.Tensor] = None,
252
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
253
+ ip_adapter_image: Optional[PipelineImageInput] = None,
254
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
255
+ output_type: Optional[str] = "pil",
256
+ return_dict: bool = True,
257
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
258
+ guidance_rescale: float = 0.0,
259
+ clip_skip: Optional[int] = None,
260
+ callback_on_step_end: Optional[
261
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
262
+ ] = None,
263
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
264
+ **kwargs,
265
+ ):
266
+ r"""
267
+ The call function to the pipeline for generation.
268
+
269
+ Args:
270
+ prompt (`str` or `List[str]`, *optional*):
271
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
272
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
273
+ The height in pixels of the generated image.
274
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
275
+ The width in pixels of the generated image.
276
+ num_inference_steps (`int`, *optional*, defaults to 50):
277
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
278
+ expense of slower inference.
279
+ timesteps (`List[int]`, *optional*):
280
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
281
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
282
+ passed will be used. Must be in descending order.
283
+ sigmas (`List[float]`, *optional*):
284
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
285
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
286
+ will be used.
287
+ guidance_scale (`float`, *optional*, defaults to 7.5):
288
+ A higher guidance scale value encourages the model to generate images closely linked to the text
289
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
290
+ negative_prompt (`str` or `List[str]`, *optional*):
291
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
292
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
293
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
294
+ The number of images to generate per prompt.
295
+ eta (`float`, *optional*, defaults to 0.0):
296
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
297
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
298
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
299
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
300
+ generation deterministic.
301
+ latents (`torch.Tensor`, *optional*):
302
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
303
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
304
+ tensor is generated by sampling using the supplied random `generator`.
305
+ prompt_embeds (`torch.Tensor`, *optional*):
306
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
307
+ provided, text embeddings are generated from the `prompt` input argument.
308
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
309
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
310
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
311
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
312
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
313
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
314
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
315
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
316
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
317
+ output_type (`str`, *optional*, defaults to `"pil"`):
318
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
319
+ return_dict (`bool`, *optional*, defaults to `True`):
320
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
321
+ plain tuple.
322
+ cross_attention_kwargs (`dict`, *optional*):
323
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
324
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
325
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
326
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
327
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
328
+ using zero terminal SNR.
329
+ clip_skip (`int`, *optional*):
330
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
331
+ the output of the pre-final layer will be used for computing the prompt embeddings.
332
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
333
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
334
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
335
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
336
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
337
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
338
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
339
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
340
+ `._callback_tensor_inputs` attribute of your pipeline class.
341
+
342
+ Examples:
343
+
344
+ Returns:
345
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
346
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
347
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
348
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
349
+ "not-safe-for-work" (nsfw) content.
350
+ """
351
+
352
+ callback = kwargs.pop("callback", None)
353
+ callback_steps = kwargs.pop("callback_steps", None)
354
+
355
+ if callback is not None:
356
+ deprecate(
357
+ "callback",
358
+ "1.0.0",
359
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
360
+ )
361
+ if callback_steps is not None:
362
+ deprecate(
363
+ "callback_steps",
364
+ "1.0.0",
365
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
366
+ )
367
+
368
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
369
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
370
+
371
+ # 0. Default height and width to unet
372
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
373
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
374
+ # to deal with lora scaling and other possible forward hooks
375
+
376
+ # 1. Check inputs. Raise error if not correct
377
+ self.check_inputs(
378
+ prompt,
379
+ height,
380
+ width,
381
+ callback_steps,
382
+ negative_prompt,
383
+ prompt_embeds,
384
+ negative_prompt_embeds,
385
+ ip_adapter_image,
386
+ ip_adapter_image_embeds,
387
+ callback_on_step_end_tensor_inputs,
388
+ )
389
+
390
+ self._guidance_scale = guidance_scale
391
+ self._guidance_rescale = guidance_rescale
392
+ self._clip_skip = clip_skip
393
+ self._cross_attention_kwargs = cross_attention_kwargs
394
+ self._interrupt = False
395
+
396
+ # 2. Define call parameters
397
+ if prompt is not None and isinstance(prompt, str):
398
+ batch_size = 1
399
+ elif prompt is not None and isinstance(prompt, list):
400
+ batch_size = len(prompt)
401
+ else:
402
+ batch_size = prompt_embeds.shape[0]
403
+
404
+ device = self._execution_device
405
+
406
+ # 3. Encode input prompt
407
+ lora_scale = (
408
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
409
+ )
410
+
411
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
412
+ prompt,
413
+ device,
414
+ num_images_per_prompt,
415
+ self.do_classifier_free_guidance,
416
+ negative_prompt,
417
+ prompt_embeds=prompt_embeds,
418
+ negative_prompt_embeds=negative_prompt_embeds,
419
+ lora_scale=lora_scale,
420
+ clip_skip=self.clip_skip,
421
+ )
422
+
423
+ # For classifier free guidance, we need to do two forward passes.
424
+ # Here we concatenate the unconditional and text embeddings into a single batch
425
+ # to avoid doing two forward passes
426
+ if self.do_classifier_free_guidance:
427
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
428
+
429
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
430
+ image_embeds = self.prepare_ip_adapter_image_embeds(
431
+ ip_adapter_image,
432
+ ip_adapter_image_embeds,
433
+ device,
434
+ batch_size * num_images_per_prompt,
435
+ self.do_classifier_free_guidance,
436
+ )
437
+
438
+ # 4. Prepare timesteps
439
+ timesteps, num_inference_steps = retrieve_timesteps(
440
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
441
+ )
442
+ assert num_images_per_prompt == 1
443
+ # 5. Prepare latent variables
444
+ num_channels_latents = self.unet.config.in_channels
445
+ latents = self.prepare_latents(
446
+ batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
447
+ num_channels_latents,
448
+ height,
449
+ width,
450
+ prompt_embeds.dtype,
451
+ device,
452
+ generator,
453
+ latents,
454
+ )
455
+
456
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
457
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
458
+
459
+ # 6.1 Add image embeds for IP-Adapter
460
+ added_cond_kwargs = (
461
+ {"image_embeds": image_embeds}
462
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
463
+ else None
464
+ )
465
+
466
+ # 6.2 Optionally get Guidance Scale Embedding
467
+ timestep_cond = None
468
+ if self.unet.config.time_cond_proj_dim is not None:
469
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
470
+ timestep_cond = self.get_guidance_scale_embedding(
471
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
472
+ ).to(device=device, dtype=latents.dtype)
473
+
474
+ # 7. Denoising loop
475
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
476
+ self._num_timesteps = len(timesteps)
477
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
478
+ for i, t in enumerate(timesteps):
479
+ if self.interrupt:
480
+ continue
481
+
482
+ # expand the latents if we are doing classifier free guidance
483
+ latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch'])
484
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
485
+ latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w')
486
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
487
+ latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch'])
488
+
489
+ # predict the noise residual
490
+
491
+ noise_pred = self.unet(
492
+ latent_model_input,
493
+ t,
494
+ encoder_hidden_states=prompt_embeds,
495
+ timestep_cond=timestep_cond,
496
+ cross_attention_kwargs=self.cross_attention_kwargs,
497
+ added_cond_kwargs=added_cond_kwargs,
498
+ return_dict=False, **kwargs
499
+ )[0]
500
+ latents = rearrange(latents, 'b n c h w -> (b n) c h w')
501
+ # perform guidance
502
+ if self.do_classifier_free_guidance:
503
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
504
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
505
+
506
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
507
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
508
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
509
+
510
+ # compute the previous noisy sample x_t -> x_t-1
511
+ latents = \
512
+ self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs,
513
+ return_dict=False)[0]
514
+
515
+ if callback_on_step_end is not None:
516
+ callback_kwargs = {}
517
+ for k in callback_on_step_end_tensor_inputs:
518
+ callback_kwargs[k] = locals()[k]
519
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
520
+
521
+ latents = callback_outputs.pop("latents", latents)
522
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
523
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
524
+
525
+ # call the callback, if provided
526
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
527
+ progress_bar.update()
528
+ if callback is not None and i % callback_steps == 0:
529
+ step_idx = i // getattr(self.scheduler, "order", 1)
530
+ callback(step_idx, t, latents)
531
+
532
+ if not output_type == "latent":
533
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
534
+ 0
535
+ ]
536
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
537
+ else:
538
+ image = latents
539
+ has_nsfw_concept = None
540
+
541
+ if has_nsfw_concept is None:
542
+ do_denormalize = [True] * image.shape[0]
543
+ else:
544
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
545
+
546
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
547
+
548
+ # Offload all models
549
+ self.maybe_free_model_hooks()
550
+
551
+ if not return_dict:
552
+ return (image, has_nsfw_concept)
553
+
554
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
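The guidance arithmetic in the denoising loop above can be reproduced in isolation. The snippet below is a minimal, self-contained sketch of the classifier-free-guidance step and of a `rescale_noise_cfg`-style helper (per Sec. 3.4 of the cited paper); it uses toy tensors in place of real UNet outputs and is not the pipeline's own implementation.

import torch

def rescale_noise_cfg_sketch(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    # Match the per-sample std of the guided prediction to the text branch,
    # then blend with the unrescaled prediction (assumed helper, for illustration).
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    rescaled = noise_cfg * (std_text / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg

# Toy stand-ins for the two halves of the batched UNet output.
noise_pred_uncond = torch.randn(2, 4, 64, 64)
noise_pred_text = torch.randn(2, 4, 64, 64)
guidance_scale, guidance_rescale = 7.5, 0.7

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
noise_pred = rescale_noise_cfg_sketch(noise_pred, noise_pred_text, guidance_rescale)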
build/lib/hy3dgen/texgen/hunyuanpaint/unet/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/hunyuanpaint/unet/modules.py ADDED
@@ -0,0 +1,440 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ import copy
27
+ import json
28
+ import os
29
+ from typing import Any, Dict, Optional
30
+
31
+ import torch
32
+ import torch.nn as nn
33
+ from diffusers.models import UNet2DConditionModel
34
+ from diffusers.models.attention_processor import Attention
35
+ from diffusers.models.transformers.transformer_2d import BasicTransformerBlock
36
+ from einops import rearrange
37
+
38
+
39
+ def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
40
+ # "feed_forward_chunk_size" can be used to save memory
41
+ if hidden_states.shape[chunk_dim] % chunk_size != 0:
42
+ raise ValueError(
43
+ f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
44
+ )
45
+
46
+ num_chunks = hidden_states.shape[chunk_dim] // chunk_size
47
+ ff_output = torch.cat(
48
+ [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
49
+ dim=chunk_dim,
50
+ )
51
+ return ff_output
52
+
53
+
54
+ class Basic2p5DTransformerBlock(torch.nn.Module):
55
+ def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None:
56
+ super().__init__()
57
+ self.transformer = transformer
58
+ self.layer_name = layer_name
59
+ self.use_ma = use_ma
60
+ self.use_ra = use_ra
61
+
62
+ # multiview attn
63
+ if self.use_ma:
64
+ self.attn_multiview = Attention(
65
+ query_dim=self.dim,
66
+ heads=self.num_attention_heads,
67
+ dim_head=self.attention_head_dim,
68
+ dropout=self.dropout,
69
+ bias=self.attention_bias,
70
+ cross_attention_dim=None,
71
+ upcast_attention=self.attn1.upcast_attention,
72
+ out_bias=True,
73
+ )
74
+
75
+ # ref attn
76
+ if self.use_ra:
77
+ self.attn_refview = Attention(
78
+ query_dim=self.dim,
79
+ heads=self.num_attention_heads,
80
+ dim_head=self.attention_head_dim,
81
+ dropout=self.dropout,
82
+ bias=self.attention_bias,
83
+ cross_attention_dim=None,
84
+ upcast_attention=self.attn1.upcast_attention,
85
+ out_bias=True,
86
+ )
87
+
88
+ def __getattr__(self, name: str):
89
+ try:
90
+ return super().__getattr__(name)
91
+ except AttributeError:
92
+ return getattr(self.transformer, name)
93
+
94
+ def forward(
95
+ self,
96
+ hidden_states: torch.Tensor,
97
+ attention_mask: Optional[torch.Tensor] = None,
98
+ encoder_hidden_states: Optional[torch.Tensor] = None,
99
+ encoder_attention_mask: Optional[torch.Tensor] = None,
100
+ timestep: Optional[torch.LongTensor] = None,
101
+ cross_attention_kwargs: Dict[str, Any] = None,
102
+ class_labels: Optional[torch.LongTensor] = None,
103
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
104
+ ) -> torch.Tensor:
105
+
106
+ # Notice that normalization is always applied before the real computation in the following blocks.
107
+ # 0. Self-Attention
108
+ batch_size = hidden_states.shape[0]
109
+
110
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
111
+ num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1)
112
+ mode = cross_attention_kwargs.pop('mode', None)
113
+ mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0)
114
+ ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0)
115
+ condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None)
116
+
117
+ if self.norm_type == "ada_norm":
118
+ norm_hidden_states = self.norm1(hidden_states, timestep)
119
+ elif self.norm_type == "ada_norm_zero":
120
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
121
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
122
+ )
123
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
124
+ norm_hidden_states = self.norm1(hidden_states)
125
+ elif self.norm_type == "ada_norm_continuous":
126
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
127
+ elif self.norm_type == "ada_norm_single":
128
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
129
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
130
+ ).chunk(6, dim=1)
131
+ norm_hidden_states = self.norm1(hidden_states)
132
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
133
+ else:
134
+ raise ValueError("Incorrect norm used")
135
+
136
+ if self.pos_embed is not None:
137
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
138
+
139
+ # 1. Prepare GLIGEN inputs
140
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
141
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
142
+
143
+ attn_output = self.attn1(
144
+ norm_hidden_states,
145
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
146
+ attention_mask=attention_mask,
147
+ **cross_attention_kwargs,
148
+ )
149
+
150
+ if self.norm_type == "ada_norm_zero":
151
+ attn_output = gate_msa.unsqueeze(1) * attn_output
152
+ elif self.norm_type == "ada_norm_single":
153
+ attn_output = gate_msa * attn_output
154
+
155
+ hidden_states = attn_output + hidden_states
156
+ if hidden_states.ndim == 4:
157
+ hidden_states = hidden_states.squeeze(1)
158
+
159
+ # 1.2 Reference Attention
160
+ if 'w' in mode:
161
+ condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c',
162
+ n=num_in_batch) # B, (N L), C
163
+
164
+ if 'r' in mode and self.use_ra:
165
+ condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1,
166
+ 1) # B N L C
167
+ condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c')
168
+
169
+ attn_output = self.attn_refview(
170
+ norm_hidden_states,
171
+ encoder_hidden_states=condition_embed,
172
+ attention_mask=None,
173
+ **cross_attention_kwargs
174
+ )
175
+ ref_scale_timing = ref_scale
176
+ if isinstance(ref_scale, torch.Tensor):
177
+ ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1)
178
+ for _ in range(attn_output.ndim - 1):
179
+ ref_scale_timing = ref_scale_timing.unsqueeze(-1)
180
+ hidden_states = ref_scale_timing * attn_output + hidden_states
181
+ if hidden_states.ndim == 4:
182
+ hidden_states = hidden_states.squeeze(1)
183
+
184
+ # 1.3 Multiview Attention
185
+ if num_in_batch > 1 and self.use_ma:
186
+ multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch)
187
+
188
+ attn_output = self.attn_multiview(
189
+ multivew_hidden_states,
190
+ encoder_hidden_states=multivew_hidden_states,
191
+ **cross_attention_kwargs
192
+ )
193
+
194
+ attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch)
195
+
196
+ hidden_states = mva_scale * attn_output + hidden_states
197
+ if hidden_states.ndim == 4:
198
+ hidden_states = hidden_states.squeeze(1)
199
+
200
+ # 1.2 GLIGEN Control
201
+ if gligen_kwargs is not None:
202
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
203
+
204
+ # 3. Cross-Attention
205
+ if self.attn2 is not None:
206
+ if self.norm_type == "ada_norm":
207
+ norm_hidden_states = self.norm2(hidden_states, timestep)
208
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
209
+ norm_hidden_states = self.norm2(hidden_states)
210
+ elif self.norm_type == "ada_norm_single":
211
+ # For PixArt norm2 isn't applied here:
212
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
213
+ norm_hidden_states = hidden_states
214
+ elif self.norm_type == "ada_norm_continuous":
215
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
216
+ else:
217
+ raise ValueError("Incorrect norm")
218
+
219
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
220
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
221
+
222
+ attn_output = self.attn2(
223
+ norm_hidden_states,
224
+ encoder_hidden_states=encoder_hidden_states,
225
+ attention_mask=encoder_attention_mask,
226
+ **cross_attention_kwargs,
227
+ )
228
+
229
+ hidden_states = attn_output + hidden_states
230
+
231
+ # 4. Feed-forward
232
+ # i2vgen doesn't have this norm 🤷‍♂️
233
+ if self.norm_type == "ada_norm_continuous":
234
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
235
+ elif not self.norm_type == "ada_norm_single":
236
+ norm_hidden_states = self.norm3(hidden_states)
237
+
238
+ if self.norm_type == "ada_norm_zero":
239
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
240
+
241
+ if self.norm_type == "ada_norm_single":
242
+ norm_hidden_states = self.norm2(hidden_states)
243
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
244
+
245
+ if self._chunk_size is not None:
246
+ # "feed_forward_chunk_size" can be used to save memory
247
+ ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
248
+ else:
249
+ ff_output = self.ff(norm_hidden_states)
250
+
251
+ if self.norm_type == "ada_norm_zero":
252
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
253
+ elif self.norm_type == "ada_norm_single":
254
+ ff_output = gate_mlp * ff_output
255
+
256
+ hidden_states = ff_output + hidden_states
257
+ if hidden_states.ndim == 4:
258
+ hidden_states = hidden_states.squeeze(1)
259
+
260
+ return hidden_states
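The multiview and reference attention in this block rely on einops reshapes between a per-view layout `(b n) l c` and a joint layout `b (n l) c`. A minimal sketch with toy tensors shows the round trip (no attention is run here):

import torch
from einops import rearrange

B, N, L, C = 2, 6, 16, 8                  # batch, views, tokens per view, channels
per_view = torch.randn(B * N, L, C)       # layout the UNet blocks operate on

# Concatenate all views of one object so self-attention can mix them.
joint = rearrange(per_view, '(b n) l c -> b (n l) c', n=N)    # (B, N*L, C)

# ... attn_multiview(joint, encoder_hidden_states=joint) would run here ...

# Back to the per-view layout expected by the rest of the block.
back = rearrange(joint, 'b (n l) c -> (b n) l c', n=N)
assert torch.equal(per_view, back)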
261
+
262
+
263
+ class UNet2p5DConditionModel(torch.nn.Module):
264
+ def __init__(self, unet: UNet2DConditionModel) -> None:
265
+ super().__init__()
266
+ self.unet = unet
267
+
268
+ self.use_ma = True
269
+ self.use_ra = True
270
+ self.use_camera_embedding = True
271
+ self.use_dual_stream = True
272
+
273
+ if self.use_dual_stream:
274
+ self.unet_dual = copy.deepcopy(unet)
275
+ self.init_attention(self.unet_dual)
276
+ self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra)
277
+ self.init_condition()
278
+ self.init_camera_embedding()
279
+
280
+ @staticmethod
281
+ def from_pretrained(pretrained_model_name_or_path, **kwargs):
282
+ torch_dtype = kwargs.pop('torch_dtype', torch.float32)
283
+ config_path = os.path.join(pretrained_model_name_or_path, 'config.json')
284
+ unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin')
285
+ with open(config_path, 'r', encoding='utf-8') as file:
286
+ config = json.load(file)
287
+ unet = UNet2DConditionModel(**config)
288
+ unet = UNet2p5DConditionModel(unet)
289
+ unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True)
290
+ unet.load_state_dict(unet_ckpt, strict=True)
291
+ unet = unet.to(torch_dtype)
292
+ return unet
293
+
294
+ def init_condition(self):
295
+ self.unet.conv_in = torch.nn.Conv2d(
296
+ 12,
297
+ self.unet.conv_in.out_channels,
298
+ kernel_size=self.unet.conv_in.kernel_size,
299
+ stride=self.unet.conv_in.stride,
300
+ padding=self.unet.conv_in.padding,
301
+ dilation=self.unet.conv_in.dilation,
302
+ groups=self.unet.conv_in.groups,
303
+ bias=self.unet.conv_in.bias is not None)
304
+
305
+ self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024))
306
+ self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024))
307
+
308
+ def init_camera_embedding(self):
309
+
310
+ if self.use_camera_embedding:
311
+ time_embed_dim = 1280
312
+ self.max_num_ref_image = 5
313
+ self.max_num_gen_image = 12 * 3 + 4 * 2
314
+ self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim)
315
+
316
+ def init_attention(self, unet, use_ma=False, use_ra=False):
317
+
318
+ for down_block_i, down_block in enumerate(unet.down_blocks):
319
+ if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention:
320
+ for attn_i, attn in enumerate(down_block.attentions):
321
+ for transformer_i, transformer in enumerate(attn.transformer_blocks):
322
+ if isinstance(transformer, BasicTransformerBlock):
323
+ attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
324
+ f'down_{down_block_i}_{attn_i}_{transformer_i}',
325
+ use_ma, use_ra)
326
+
327
+ if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention:
328
+ for attn_i, attn in enumerate(unet.mid_block.attentions):
329
+ for transformer_i, transformer in enumerate(attn.transformer_blocks):
330
+ if isinstance(transformer, BasicTransformerBlock):
331
+ attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
332
+ f'mid_{attn_i}_{transformer_i}',
333
+ use_ma, use_ra)
334
+
335
+ for up_block_i, up_block in enumerate(unet.up_blocks):
336
+ if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention:
337
+ for attn_i, attn in enumerate(up_block.attentions):
338
+ for transformer_i, transformer in enumerate(attn.transformer_blocks):
339
+ if isinstance(transformer, BasicTransformerBlock):
340
+ attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
341
+ f'up_{up_block_i}_{attn_i}_{transformer_i}',
342
+ use_ma, use_ra)
343
+
344
+ def __getattr__(self, name: str):
345
+ try:
346
+ return super().__getattr__(name)
347
+ except AttributeError:
348
+ return getattr(self.unet, name)
349
+
350
+ def forward(
351
+ self, sample, timestep, encoder_hidden_states,
352
+ *args, down_intrablock_additional_residuals=None,
353
+ down_block_res_samples=None, mid_block_res_sample=None,
354
+ **cached_condition,
355
+ ):
356
+ B, N_gen, _, H, W = sample.shape
357
+ assert H == W
358
+
359
+ if self.use_camera_embedding:
360
+ camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image
361
+ camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)')
362
+ else:
363
+ camera_info_gen = None
364
+
365
+ sample = [sample]
366
+ if 'normal_imgs' in cached_condition:
367
+ sample.append(cached_condition["normal_imgs"])
368
+ if 'position_imgs' in cached_condition:
369
+ sample.append(cached_condition["position_imgs"])
370
+ sample = torch.cat(sample, dim=2)
371
+
372
+ sample = rearrange(sample, 'b n c h w -> (b n) c h w')
373
+
374
+ encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1)
375
+ encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c')
376
+
377
+ if self.use_ra:
378
+ if 'condition_embed_dict' in cached_condition:
379
+ condition_embed_dict = cached_condition['condition_embed_dict']
380
+ else:
381
+ condition_embed_dict = {}
382
+ ref_latents = cached_condition['ref_latents']
383
+ N_ref = ref_latents.shape[1]
384
+ if self.use_camera_embedding:
385
+ camera_info_ref = cached_condition['camera_info_ref']
386
+ camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)')
387
+ else:
388
+ camera_info_ref = None
389
+
390
+ ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w')
391
+
392
+ encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1)
393
+ encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c')
394
+
395
+ noisy_ref_latents = ref_latents
396
+ timestep_ref = 0
397
+
398
+ if self.use_dual_stream:
399
+ unet_ref = self.unet_dual
400
+ else:
401
+ unet_ref = self.unet
402
+ unet_ref(
403
+ noisy_ref_latents, timestep_ref,
404
+ encoder_hidden_states=encoder_hidden_states_ref,
405
+ class_labels=camera_info_ref,
406
+ # **kwargs
407
+ return_dict=False,
408
+ cross_attention_kwargs={
409
+ 'mode': 'w', 'num_in_batch': N_ref,
410
+ 'condition_embed_dict': condition_embed_dict},
411
+ )
412
+ cached_condition['condition_embed_dict'] = condition_embed_dict
413
+ else:
414
+ condition_embed_dict = None
415
+
416
+ mva_scale = cached_condition.get('mva_scale', 1.0)
417
+ ref_scale = cached_condition.get('ref_scale', 1.0)
418
+
419
+ return self.unet(
420
+ sample, timestep,
421
+ encoder_hidden_states_gen, *args,
422
+ class_labels=camera_info_gen,
423
+ down_intrablock_additional_residuals=[
424
+ sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals
425
+ ] if down_intrablock_additional_residuals is not None else None,
426
+ down_block_additional_residuals=[
427
+ sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples
428
+ ] if down_block_res_samples is not None else None,
429
+ mid_block_additional_residual=(
430
+ mid_block_res_sample.to(dtype=self.unet.dtype)
431
+ if mid_block_res_sample is not None else None
432
+ ),
433
+ return_dict=False,
434
+ cross_attention_kwargs={
435
+ 'mode': 'r', 'num_in_batch': N_gen,
436
+ 'condition_embed_dict': condition_embed_dict,
437
+ 'mva_scale': mva_scale,
438
+ 'ref_scale': ref_scale,
439
+ },
440
+ )
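For completeness, a loading sketch for the wrapper above; the checkpoint directory is a placeholder that must contain the `config.json` and `diffusion_pytorch_model.bin` files read by the static `from_pretrained`, and the comments summarize the inputs `forward` expects as suggested by the code.

import torch

# Placeholder path: a UNet directory with config.json and diffusion_pytorch_model.bin.
unet = UNet2p5DConditionModel.from_pretrained('./hunyuan3d-paint-v2-0/unet', torch_dtype=torch.float16)

# forward() expects, per the code above:
#   sample:                 (B, N_gen, C, H, W) noisy latents with H == W
#   encoder_hidden_states:  (B, 77, 1024) conditioning embeddings
#   cached_condition keys:  'ref_latents' (B, N_ref, C, H, W), 'camera_info_gen',
#                           'camera_info_ref', optional 'normal_imgs'/'position_imgs'
#                           latents concatenated onto sample along the channel axis.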
build/lib/hy3dgen/texgen/pipelines.py ADDED
@@ -0,0 +1,227 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ import logging
27
+ import os
28
+
29
+ import numpy as np
30
+ import torch
31
+ from PIL import Image
32
+
33
+ from .differentiable_renderer.mesh_render import MeshRender
34
+ from .utils.dehighlight_utils import Light_Shadow_Remover
35
+ from .utils.multiview_utils import Multiview_Diffusion_Net
36
+ from .utils.uv_warp_utils import mesh_uv_wrap
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ class Hunyuan3DTexGenConfig:
42
+
43
+ def __init__(self, light_remover_ckpt_path, multiview_ckpt_path):
44
+ self.device = 'cpu'
45
+ self.light_remover_ckpt_path = light_remover_ckpt_path
46
+ self.multiview_ckpt_path = multiview_ckpt_path
47
+
48
+ self.candidate_camera_azims = [0, 90, 180, 270, 0, 180]
49
+ self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90]
50
+ self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05]
51
+
52
+ self.render_size = 2048
53
+ self.texture_size = 1024
54
+ self.bake_exp = 4
55
+ self.merge_method = 'fast'
56
+
57
+
58
+ class Hunyuan3DPaintPipeline:
59
+ @classmethod
60
+ def from_pretrained(cls, model_path):
61
+ original_model_path = model_path
62
+ if not os.path.exists(model_path):
63
+ # try local path
64
+ base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
65
+ model_path = os.path.expanduser(os.path.join(base_dir, model_path))
66
+
67
+ delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
68
+ multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
69
+
70
+ if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path):
71
+ try:
72
+ import huggingface_hub
73
+ # download from huggingface
74
+ model_path = huggingface_hub.snapshot_download(repo_id=original_model_path)
75
+ delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
76
+ multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
77
+ return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
78
+ except ImportError:
79
+ logger.warning(
80
+ "You need to install HuggingFace Hub to load models from the hub."
81
+ )
82
+ raise RuntimeError(f"Model path {model_path} not found")
83
+ else:
84
+ return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
85
+
86
+ raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface")
87
+
88
+ def __init__(self, config):
89
+ self.config = config
90
+ self.models = {}
91
+ self.render = MeshRender(
92
+ default_resolution=self.config.render_size,
93
+ texture_size=self.config.texture_size)
94
+
95
+ self.load_models()
96
+
97
+ def load_models(self):
98
+ # empty cuda cache
99
+ torch.cuda.empty_cache()
100
+ # Load model
101
+ self.models['delight_model'] = Light_Shadow_Remover(self.config)
102
+ self.models['multiview_model'] = Multiview_Diffusion_Net(self.config)
103
+
104
+ def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True):
105
+ normal_maps = []
106
+ for elev, azim in zip(camera_elevs, camera_azims):
107
+ normal_map = self.render.render_normal(
108
+ elev, azim, use_abs_coor=use_abs_coor, return_type='pl')
109
+ normal_maps.append(normal_map)
110
+
111
+ return normal_maps
112
+
113
+ def render_position_multiview(self, camera_elevs, camera_azims):
114
+ position_maps = []
115
+ for elev, azim in zip(camera_elevs, camera_azims):
116
+ position_map = self.render.render_position(
117
+ elev, azim, return_type='pl')
118
+ position_maps.append(position_map)
119
+
120
+ return position_maps
121
+
122
+ def bake_from_multiview(self, views, camera_elevs,
123
+ camera_azims, view_weights, method='graphcut'):
124
+ project_textures, project_weighted_cos_maps = [], []
125
+ project_boundary_maps = []
126
+ for view, camera_elev, camera_azim, weight in zip(
127
+ views, camera_elevs, camera_azims, view_weights):
128
+ project_texture, project_cos_map, project_boundary_map = self.render.back_project(
129
+ view, camera_elev, camera_azim)
130
+ project_cos_map = weight * (project_cos_map ** self.config.bake_exp)
131
+ project_textures.append(project_texture)
132
+ project_weighted_cos_maps.append(project_cos_map)
133
+ project_boundary_maps.append(project_boundary_map)
134
+
135
+ if method == 'fast':
136
+ texture, ori_trust_map = self.render.fast_bake_texture(
137
+ project_textures, project_weighted_cos_maps)
138
+ else:
139
+ raise ValueError(f'no method {method}')
140
+ return texture, ori_trust_map > 1E-8
141
+
142
+ def texture_inpaint(self, texture, mask):
143
+
144
+ texture_np = self.render.uv_inpaint(texture, mask)
145
+ texture = torch.tensor(texture_np / 255).float().to(texture.device)
146
+
147
+ return texture
148
+
149
+ def recenter_image(self, image, border_ratio=0.2):
150
+ if image.mode == 'RGB':
151
+ return image
152
+ elif image.mode == 'L':
153
+ image = image.convert('RGB')
154
+ return image
155
+
156
+ alpha_channel = np.array(image)[:, :, 3]
157
+ non_zero_indices = np.argwhere(alpha_channel > 0)
158
+ if non_zero_indices.size == 0:
159
+ raise ValueError("Image is fully transparent")
160
+
161
+ min_row, min_col = non_zero_indices.min(axis=0)
162
+ max_row, max_col = non_zero_indices.max(axis=0)
163
+
164
+ cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1))
165
+
166
+ width, height = cropped_image.size
167
+ border_width = int(width * border_ratio)
168
+ border_height = int(height * border_ratio)
169
+
170
+ new_width = width + 2 * border_width
171
+ new_height = height + 2 * border_height
172
+
173
+ square_size = max(new_width, new_height)
174
+
175
+ new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0))
176
+
177
+ paste_x = (square_size - new_width) // 2 + border_width
178
+ paste_y = (square_size - new_height) // 2 + border_height
179
+
180
+ new_image.paste(cropped_image, (paste_x, paste_y))
181
+ return new_image
182
+
183
+ @torch.no_grad()
184
+ def __call__(self, mesh, image):
185
+
186
+ if isinstance(image, str):
187
+ image_prompt = Image.open(image)
188
+ else:
189
+ image_prompt = image
190
+
191
+ image_prompt = self.recenter_image(image_prompt)
192
+
193
+ image_prompt = self.models['delight_model'](image_prompt)
194
+
195
+ mesh = mesh_uv_wrap(mesh)
196
+
197
+ self.render.load_mesh(mesh)
198
+
199
+ selected_camera_elevs, selected_camera_azims, selected_view_weights = \
200
+ self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights
201
+
202
+ normal_maps = self.render_normal_multiview(
203
+ selected_camera_elevs, selected_camera_azims, use_abs_coor=True)
204
+ position_maps = self.render_position_multiview(
205
+ selected_camera_elevs, selected_camera_azims)
206
+
207
+ camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[
208
+ elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in
209
+ zip(selected_camera_azims, selected_camera_elevs)]
210
+ multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info)
211
+
212
+ for i in range(len(multiviews)):
213
+ multiviews[i] = multiviews[i].resize(
214
+ (self.config.render_size, self.config.render_size))
215
+
216
+ texture, mask = self.bake_from_multiview(multiviews,
217
+ selected_camera_elevs, selected_camera_azims, selected_view_weights,
218
+ method=self.config.merge_method)
219
+
220
+ mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)
221
+
222
+ texture = self.texture_inpaint(texture, mask_np)
223
+
224
+ self.render.set_texture(texture)
225
+ textured_mesh = self.render.save_mesh()
226
+
227
+ return textured_mesh
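A minimal end-to-end usage sketch for the pipeline above, assuming the package's `texgen` __init__ re-exports the class as in the upstream repository; the checkpoint id and asset paths are placeholders, and a CUDA environment with the differentiable renderer built is assumed.

import trimesh
from hy3dgen.texgen import Hunyuan3DPaintPipeline

# Placeholder identifiers: substitute your checkpoint directory / Hugging Face repo id.
pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')

mesh = trimesh.load('demo_shape.glb', force='mesh')      # untextured shape
textured_mesh = pipeline(mesh, image='demo_image.png')   # a path or a PIL.Image both work
textured_mesh.export('demo_textured.glb')                # assuming a trimesh.Trimesh is returned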
build/lib/hy3dgen/texgen/utils/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
build/lib/hy3dgen/texgen/utils/alignImg4Tex_utils.py ADDED
@@ -0,0 +1,132 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ import torch
27
+ from diffusers import EulerAncestralDiscreteScheduler
28
+ from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \
29
+ AutoencoderKL
30
+
31
+
32
+ class Img2img_Control_Ip_adapter:
33
+ def __init__(self, device):
34
+ controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16,
35
+ variant="fp16", use_safetensors=True)
36
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
37
+ 'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
38
+ )
39
+ pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
40
+ pipe.set_ip_adapter_scale(0.7)
41
+
42
+ pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
43
+ # pipe.enable_model_cpu_offload()
44
+ self.pipe = pipe.to(device)
45
+
46
+ def __call__(
47
+ self,
48
+ prompt,
49
+ control_image,
50
+ ip_adapter_image,
51
+ negative_prompt,
52
+ height=512,
53
+ width=512,
54
+ num_inference_steps=20,
55
+ guidance_scale=8.0,
56
+ controlnet_conditioning_scale=1.0,
57
+ output_type="pil",
58
+ **kwargs,
59
+ ):
60
+ results = self.pipe(
61
+ prompt=prompt,
62
+ negative_prompt=negative_prompt,
63
+ image=control_image,
64
+ ip_adapter_image=ip_adapter_image,
65
+ generator=torch.manual_seed(42),
66
+ seed=42,
67
+ num_inference_steps=num_inference_steps,
68
+ guidance_scale=guidance_scale,
69
+ controlnet_conditioning_scale=controlnet_conditioning_scale,
70
+ strength=1,
71
+ # clip_skip=2,
72
+ height=height,
73
+ width=width,
74
+ output_type=output_type,
75
+ **kwargs,
76
+ ).images[0]
77
+ return results
78
+
79
+
80
+ ################################################################
81
+
82
+ class HesModel:
83
+ def __init__(self, ):
84
+ controlnet_depth = ControlNetModel.from_pretrained(
85
+ 'diffusers/controlnet-depth-sdxl-1.0',
86
+ torch_dtype=torch.float16,
87
+ variant="fp16",
88
+ use_safetensors=True
89
+ )
90
+ self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
91
+ 'stabilityai/stable-diffusion-xl-base-1.0',
92
+ torch_dtype=torch.float16,
93
+ variant="fp16",
94
+ controlnet=controlnet_depth,
95
+ use_safetensors=True,
96
+ )
97
+ self.pipe.vae = AutoencoderKL.from_pretrained(
98
+ 'madebyollin/sdxl-vae-fp16-fix',
99
+ torch_dtype=torch.float16
100
+ )
101
+
102
+ self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors")
103
+ self.pipe.set_ip_adapter_scale(0.7)
104
+ self.pipe.to("cuda")
105
+
106
+ def __call__(self,
107
+ init_image,
108
+ control_image,
109
+ ip_adapter_image=None,
110
+ prompt='3D image',
111
+ negative_prompt='2D image',
112
+ seed=42,
113
+ strength=0.8,
114
+ num_inference_steps=40,
115
+ guidance_scale=7.5,
116
+ controlnet_conditioning_scale=0.5,
117
+ **kwargs
118
+ ):
119
+ image = self.pipe(
120
+ prompt=prompt,
121
+ image=init_image,
122
+ control_image=control_image,
123
+ ip_adapter_image=ip_adapter_image,
124
+ negative_prompt=negative_prompt,
125
+ num_inference_steps=num_inference_steps,
126
+ guidance_scale=guidance_scale,
127
+ strength=strength,
128
+ controlnet_conditioning_scale=controlnet_conditioning_scale,
129
+ seed=seed,
130
+ **kwargs
131
+ ).images[0]
132
+ return image
build/lib/hy3dgen/texgen/utils/counter_utils.py ADDED
@@ -0,0 +1,58 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ class RunningStats():
27
+ def __init__(self) -> None:
28
+ self.count = 0
29
+ self.sum = 0
30
+ self.mean = 0
31
+ self.min = None
32
+ self.max = None
33
+
34
+ def add_value(self, value):
35
+ self.count += 1
36
+ self.sum += value
37
+ self.mean = self.sum / self.count
38
+
39
+ if self.min is None or value < self.min:
40
+ self.min = value
41
+
42
+ if self.max is None or value > self.max:
43
+ self.max = value
44
+
45
+ def get_count(self):
46
+ return self.count
47
+
48
+ def get_sum(self):
49
+ return self.sum
50
+
51
+ def get_mean(self):
52
+ return self.mean
53
+
54
+ def get_min(self):
55
+ return self.min
56
+
57
+ def get_max(self):
58
+ return self.max
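A short usage sketch of the running-statistics helper above:

stats = RunningStats()
for value in [3.0, 7.0, 5.0]:
    stats.add_value(value)

print(stats.get_count())                 # 3
print(stats.get_mean())                  # 5.0
print(stats.get_min(), stats.get_max())  # 3.0 7.0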
build/lib/hy3dgen/texgen/utils/dehighlight_utils.py ADDED
@@ -0,0 +1,84 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import cv2
26
+ import numpy as np
27
+ import torch
28
+ from PIL import Image
29
+ from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
30
+
31
+
32
+ class Light_Shadow_Remover():
33
+ def __init__(self, config):
34
+ self.device = config.device
35
+ self.cfg_image = 1.5
36
+ self.cfg_text = 1.0
37
+
38
+ pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
39
+ config.light_remover_ckpt_path,
40
+ torch_dtype=torch.float16,
41
+ safety_checker=None,
42
+ )
43
+ pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
44
+ pipeline.set_progress_bar_config(disable=True)
45
+
46
+ # self.pipeline = pipeline.to(self.device, torch.float16)
47
+ self.pipeline = pipeline # Needed to avoid displaying the warning
48
+ @torch.no_grad()
49
+ def __call__(self, image):
50
+
51
+ image = image.resize((512, 512))
52
+
53
+ if image.mode == 'RGBA':
54
+ image_array = np.array(image)
55
+ alpha_channel = image_array[:, :, 3]
56
+ erosion_size = 3
57
+ kernel = np.ones((erosion_size, erosion_size), np.uint8)
58
+ alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
59
+ image_array[alpha_channel == 0, :3] = 255
60
+ image_array[:, :, 3] = alpha_channel
61
+ image = Image.fromarray(image_array)
62
+
63
+ image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
64
+ alpha = image_tensor[:, :, 3:]
65
+ rgb_target = image_tensor[:, :, :3]
66
+ else:
67
+ image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
68
+ alpha = torch.ones_like(image_tensor)[:, :, :1]
69
+ rgb_target = image_tensor[:, :, :3]
70
+
71
+ image = image.convert('RGB')
72
+
73
+ image = self.pipeline(
74
+ prompt="",
75
+ image=image,
76
+ generator=torch.manual_seed(42),
77
+ height=512,
78
+ width=512,
79
+ num_inference_steps=50,
80
+ image_guidance_scale=self.cfg_image,
81
+ guidance_scale=self.cfg_text,
82
+ ).images[0]
83
+
84
+ return image
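`Light_Shadow_Remover` only reads `device` and `light_remover_ckpt_path` from its config, so it can be exercised on its own. A sketch assuming a local delight checkpoint directory (placeholder path); note that the constructor above leaves the pipeline on the CPU, so it is moved to the GPU explicitly here.

from types import SimpleNamespace
from PIL import Image

config = SimpleNamespace(device='cuda', light_remover_ckpt_path='./hunyuan3d-delight-v2-0')
delight = Light_Shadow_Remover(config)
delight.pipeline.to('cuda')            # the wrapper itself skips this step

image = Image.open('render_rgba.png')  # RGBA or RGB; resized to 512x512 internally
delit = delight(image)                 # 512x512 PIL image with lighting/shadows removed
delit.save('delit.png')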
build/lib/hy3dgen/texgen/utils/multiview_utils.py ADDED
@@ -0,0 +1,86 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import os
26
+ import random
27
+
28
+ import numpy as np
29
+ import torch
30
+ from diffusers import DiffusionPipeline
31
+ from diffusers import EulerAncestralDiscreteScheduler
32
+
33
+
34
+ class Multiview_Diffusion_Net():
35
+ def __init__(self, config) -> None:
36
+ self.device = config.device
37
+ self.view_size = 512
38
+ multiview_ckpt_path = config.multiview_ckpt_path
39
+
40
+ current_file_path = os.path.abspath(__file__)
41
+ custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint')
42
+
43
+ pipeline = DiffusionPipeline.from_pretrained(
44
+ multiview_ckpt_path,
45
+ custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16)
46
+
47
+ pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config,
48
+ timestep_spacing='trailing')
49
+
50
+ pipeline.set_progress_bar_config(disable=True)
51
+ self.pipeline = pipeline  # .to(self.device) is skipped here only to avoid displaying the warning
52
+
53
+ def seed_everything(self, seed):
54
+ random.seed(seed)
55
+ np.random.seed(seed)
56
+ torch.manual_seed(seed)
57
+ os.environ["PL_GLOBAL_SEED"] = str(seed)
58
+
59
+ def __call__(self, input_image, control_images, camera_info):
60
+
61
+ self.seed_everything(0)
62
+
63
+ input_image = input_image.resize((self.view_size, self.view_size))
64
+ for i in range(len(control_images)):
65
+ control_images[i] = control_images[i].resize((self.view_size, self.view_size))
66
+ if control_images[i].mode == 'L':
67
+ control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1')
68
+
69
+ kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0))
70
+
71
+ num_view = len(control_images) // 2
72
+ normal_image = [[control_images[i] for i in range(num_view)]]
73
+ position_image = [[control_images[i + num_view] for i in range(num_view)]]
74
+
75
+ camera_info_gen = [camera_info]
76
+ camera_info_ref = [[0]]
77
+ kwargs['width'] = self.view_size
78
+ kwargs['height'] = self.view_size
79
+ kwargs['num_in_batch'] = num_view
80
+ kwargs['camera_info_gen'] = camera_info_gen
81
+ kwargs['camera_info_ref'] = camera_info_ref
82
+ kwargs["normal_imgs"] = normal_image
83
+ kwargs["position_imgs"] = position_image
84
+
85
+ mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images
86
+ return mvd_image
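A minimal usage sketch of Multiview_Diffusion_Net, assuming a config namespace with only the two attributes read above (device, multiview_ckpt_path), hypothetical file paths, and an illustrative view count and camera index list:

from types import SimpleNamespace
from PIL import Image

config = SimpleNamespace(device='cuda', multiview_ckpt_path='weights/hunyuanpaint')  # hypothetical checkpoint path
net = Multiview_Diffusion_Net(config)

ref = Image.open('reference.png').convert('RGB')
# control_images is expected to hold num_view normal maps followed by num_view position maps
controls = [Image.open(f'normal_{i}.png') for i in range(6)] + \
           [Image.open(f'position_{i}.png') for i in range(6)]
views = net(ref, controls, camera_info=[0, 1, 2, 3, 4, 5])  # list of generated view images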
build/lib/hy3dgen/texgen/utils/simplify_mesh_utils.py ADDED
@@ -0,0 +1,46 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import trimesh
26
+
27
+
28
+ def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
29
+ if method == 'trimesh':
30
+ mesh_simplify_trimesh(mesh_path, remesh_path)
31
+ else:
32
+ raise f'Method {method} has not been implemented.'
33
+
34
+
35
+ def mesh_simplify_trimesh(inputpath, outputpath):
36
+ import pymeshlab
37
+ ms = pymeshlab.MeshSet()
38
+ ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
39
+ ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)
40
+
41
+ current = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
42
+ face_num = current.faces.shape[0]
43
+
44
+ if face_num > 100000:
45
+ current = current.simplify_quadric_decimation(40000)
46
+ current.export(outputpath)
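A usage sketch with hypothetical paths: the function round-trips the input through pymeshlab to an OBJ, reloads it with trimesh, decimates to 40,000 faces only when the face count exceeds 100,000, and exports the result to the output path.

remesh_mesh('input_mesh.glb', 'simplified_mesh.glb')  # method defaults to 'trimesh'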
build/lib/hy3dgen/texgen/utils/uv_warp_utils.py ADDED
@@ -0,0 +1,42 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import trimesh
26
+ import xatlas
27
+
28
+
29
+ def mesh_uv_wrap(mesh):
30
+ if isinstance(mesh, trimesh.Scene):
31
+ mesh = mesh.dump(concatenate=True)
32
+
33
+ # if len(mesh.faces) > 50000:
34
+ # raise ValueError("The mesh has more than 50,000 faces, which is not supported.")
35
+
36
+ vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)
37
+
38
+ mesh.vertices = mesh.vertices[vmapping]
39
+ mesh.faces = indices
40
+ mesh.visual.uv = uvs
41
+
42
+ return mesh
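A short usage sketch (hypothetical file names): xatlas re-indexes the vertices, so the returned mesh may contain more vertices than the input but gains a per-vertex UV layout.

import trimesh

mesh = trimesh.load('model.obj', force='mesh')
mesh = mesh_uv_wrap(mesh)           # adds mesh.visual.uv via xatlas.parametrize
mesh.export('model_with_uv.obj')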
build/lib/hy3dgen/text2image.py ADDED
@@ -0,0 +1,93 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ import os
27
+ import random
28
+
29
+ import numpy as np
30
+ import torch
31
+ from diffusers import AutoPipelineForText2Image
32
+
33
+
34
+ def seed_everything(seed):
35
+ random.seed(seed)
36
+ np.random.seed(seed)
37
+ torch.manual_seed(seed)
38
+ os.environ["PL_GLOBAL_SEED"] = str(seed)
39
+
40
+
41
+ class HunyuanDiTPipeline:
42
+ def __init__(
43
+ self,
44
+ model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
45
+ device='cpu'
46
+ ):
47
+ torch.set_default_device('cpu')
48
+ self.device = device
49
+ self.pipe = AutoPipelineForText2Image.from_pretrained(
50
+ model_path,
51
+ torch_dtype=torch.float16,
52
+ enable_pag=True,
53
+ pag_applied_layers=["blocks.(16|17|18|19)"]
54
+ )  # .to(device) deliberately omitted to avoid displaying the warning
55
+ self.pos_txt = ",白色背景,3D风格,最佳质量"  # ", white background, 3D style, best quality"
56
+ self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
57
+ "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
58
+ "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
59
+ "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"
60
+
61
+ def compile(self):
62
+ # accelerate the hunyuan-dit transformer; the first inference will take a long time
63
+ torch.set_float32_matmul_precision('high')
64
+ self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True)
65
+ # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True)
66
+ generator = torch.Generator(device=self.pipe.device) # infer once for hot-start
67
+ out_img = self.pipe(
68
+ prompt='美少女战士',  # warm-up prompt ("Sailor Moon")
69
+ negative_prompt='模糊',  # "blurry"
70
+ num_inference_steps=25,
71
+ pag_scale=1.3,
72
+ width=1024,
73
+ height=1024,
74
+ generator=generator,
75
+ return_dict=False
76
+ )[0][0]
77
+
78
+ @torch.no_grad()
79
+ def __call__(self, prompt, seed=0):
80
+ seed_everything(seed)
81
+ generator = torch.Generator(device="cuda")  # hard-coded to CUDA rather than self.pipe.device
82
+ generator = generator.manual_seed(int(seed))
83
+ out_img = self.pipe(
84
+ prompt=self.pos_txt+prompt,
85
+ negative_prompt=self.neg_txt,
86
+ num_inference_steps=20,
87
+ pag_scale=1.3,
88
+ width=1024,
89
+ height=1024,
90
+ generator=generator,
91
+ return_dict=False
92
+ )[0][0]
93
+ return out_img
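A usage sketch for HunyuanDiTPipeline. The constructor intentionally keeps the pipeline on CPU, so moving it to the GPU here is an assumption on the caller's side; note also that __call__ hard-codes a CUDA generator.

t2i = HunyuanDiTPipeline(device='cuda')
t2i.pipe.to('cuda')                      # assumed: the constructor skips .to(device)
image = t2i('a pair of red sneakers')    # the Chinese positive/negative prompt strings above are applied automatically
image.save('sneakers.png')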
dist/hy3dgen-2.0.0-py3.12.egg ADDED
Binary file (189 kB).
 
hy3dgen.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
1
+ Metadata-Version: 2.2
2
+ Name: hy3dgen
3
+ Version: 2.0.0
hy3dgen.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,37 @@
1
+ README.md
2
+ setup.py
3
+ hy3dgen/__init__.py
4
+ hy3dgen/rembg.py
5
+ hy3dgen/text2image.py
6
+ hy3dgen.egg-info/PKG-INFO
7
+ hy3dgen.egg-info/SOURCES.txt
8
+ hy3dgen.egg-info/dependency_links.txt
9
+ hy3dgen.egg-info/top_level.txt
10
+ hy3dgen/shapegen/__init__.py
11
+ hy3dgen/shapegen/pipelines.py
12
+ hy3dgen/shapegen/postprocessors.py
13
+ hy3dgen/shapegen/preprocessors.py
14
+ hy3dgen/shapegen/schedulers.py
15
+ hy3dgen/shapegen/models/__init__.py
16
+ hy3dgen/shapegen/models/conditioner.py
17
+ hy3dgen/shapegen/models/hunyuan3ddit.py
18
+ hy3dgen/shapegen/models/vae.py
19
+ hy3dgen/texgen/__init__.py
20
+ hy3dgen/texgen/pipelines.py
21
+ hy3dgen/texgen/differentiable_renderer/__init__.py
22
+ hy3dgen/texgen/differentiable_renderer/camera_utils.py
23
+ hy3dgen/texgen/differentiable_renderer/mesh_processor.py
24
+ hy3dgen/texgen/differentiable_renderer/mesh_render.py
25
+ hy3dgen/texgen/differentiable_renderer/mesh_utils.py
26
+ hy3dgen/texgen/differentiable_renderer/setup.py
27
+ hy3dgen/texgen/hunyuanpaint/__init__.py
28
+ hy3dgen/texgen/hunyuanpaint/pipeline.py
29
+ hy3dgen/texgen/hunyuanpaint/unet/__init__.py
30
+ hy3dgen/texgen/hunyuanpaint/unet/modules.py
31
+ hy3dgen/texgen/utils/__init__.py
32
+ hy3dgen/texgen/utils/alignImg4Tex_utils.py
33
+ hy3dgen/texgen/utils/counter_utils.py
34
+ hy3dgen/texgen/utils/dehighlight_utils.py
35
+ hy3dgen/texgen/utils/multiview_utils.py
36
+ hy3dgen/texgen/utils/simplify_mesh_utils.py
37
+ hy3dgen/texgen/utils/uv_warp_utils.py
hy3dgen.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
hy3dgen.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ hy3dgen
hy3dgen/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
hy3dgen/rembg.py ADDED
@@ -0,0 +1,36 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ from PIL import Image
27
+ from rembg import remove, new_session
28
+
29
+
30
+ class BackgroundRemover():
31
+ def __init__(self):
32
+ self.session = new_session()
33
+
34
+ def __call__(self, image: Image.Image):
35
+ output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
36
+ return output
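A usage sketch (hypothetical image path): with bgcolor=[255, 255, 255, 0], the removed background is replaced by fully transparent white in the returned RGBA image.

from PIL import Image

remover = BackgroundRemover()
image = Image.open('input.png').convert('RGB')
rgba = remover(image)                # PIL RGBA image with the background made transparent
rgba.save('input_rgba.png')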
hy3dgen/shapegen/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
26
+ from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
27
+ from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+
26
+ from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
27
+ from .hunyuan3ddit import Hunyuan3DDiT
28
+ from .vae import ShapeVAE
hy3dgen/shapegen/models/conditioner.py ADDED
@@ -0,0 +1,165 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ from torchvision import transforms
28
+ from transformers import (
29
+ CLIPVisionModelWithProjection,
30
+ CLIPVisionConfig,
31
+ Dinov2Model,
32
+ Dinov2Config,
33
+ )
34
+
35
+
36
+ class ImageEncoder(nn.Module):
37
+ def __init__(
38
+ self,
39
+ version=None,
40
+ config=None,
41
+ use_cls_token=True,
42
+ image_size=224,
43
+ **kwargs,
44
+ ):
45
+ super().__init__()
46
+
47
+ if config is None:
48
+ self.model = self.MODEL_CLASS.from_pretrained(version)
49
+ else:
50
+ self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
51
+ self.model.eval()
52
+ self.model.requires_grad_(False)
53
+ self.use_cls_token = use_cls_token
54
+ self.size = image_size // 14
55
+ self.num_patches = (image_size // 14) ** 2
56
+ if self.use_cls_token:
57
+ self.num_patches += 1
58
+
59
+ self.transform = transforms.Compose(
60
+ [
61
+ transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True),
62
+ transforms.CenterCrop(image_size),
63
+ transforms.Normalize(
64
+ mean=self.mean,
65
+ std=self.std,
66
+ ),
67
+ ]
68
+ )
69
+
70
+ def forward(self, image, mask=None, value_range=(-1, 1)):
71
+ if value_range is not None:
72
+ low, high = value_range
73
+ image = (image - low) / (high - low)
74
+
75
+ image = image.to(self.model.device, dtype=self.model.dtype)
76
+ inputs = self.transform(image)
77
+ outputs = self.model(inputs)
78
+
79
+ last_hidden_state = outputs.last_hidden_state
80
+ if not self.use_cls_token:
81
+ last_hidden_state = last_hidden_state[:, 1:, :]
82
+
83
+ return last_hidden_state
84
+
85
+ def unconditional_embedding(self, batch_size):
86
+ device = next(self.model.parameters()).device
87
+ dtype = next(self.model.parameters()).dtype
88
+ zero = torch.zeros(
89
+ batch_size,
90
+ self.num_patches,
91
+ self.model.config.hidden_size,
92
+ device=device,
93
+ dtype=dtype,
94
+ )
95
+
96
+ return zero
97
+
98
+
99
+ class CLIPImageEncoder(ImageEncoder):
100
+ MODEL_CLASS = CLIPVisionModelWithProjection
101
+ MODEL_CONFIG_CLASS = CLIPVisionConfig
102
+ mean = [0.48145466, 0.4578275, 0.40821073]
103
+ std = [0.26862954, 0.26130258, 0.27577711]
104
+
105
+
106
+ class DinoImageEncoder(ImageEncoder):
107
+ MODEL_CLASS = Dinov2Model
108
+ MODEL_CONFIG_CLASS = Dinov2Config
109
+ mean = [0.485, 0.456, 0.406]
110
+ std = [0.229, 0.224, 0.225]
111
+
112
+
113
+ def build_image_encoder(config):
114
+ if config['type'] == 'CLIPImageEncoder':
115
+ return CLIPImageEncoder(**config['kwargs'])
116
+ elif config['type'] == 'DinoImageEncoder':
117
+ return DinoImageEncoder(**config['kwargs'])
118
+ else:
119
+ raise ValueError(f'Unknown image encoder type: {config["type"]}')
120
+
121
+
122
+ class DualImageEncoder(nn.Module):
123
+ def __init__(
124
+ self,
125
+ main_image_encoder,
126
+ additional_image_encoder,
127
+ ):
128
+ super().__init__()
129
+ self.main_image_encoder = build_image_encoder(main_image_encoder)
130
+ self.additional_image_encoder = build_image_encoder(additional_image_encoder)
131
+
132
+ def forward(self, image, mask=None):
133
+ outputs = {
134
+ 'main': self.main_image_encoder(image, mask=mask),
135
+ 'additional': self.additional_image_encoder(image, mask=mask),
136
+ }
137
+ return outputs
138
+
139
+ def unconditional_embedding(self, batch_size):
140
+ outputs = {
141
+ 'main': self.main_image_encoder.unconditional_embedding(batch_size),
142
+ 'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
143
+ }
144
+ return outputs
145
+
146
+
147
+ class SingleImageEncoder(nn.Module):
148
+ def __init__(
149
+ self,
150
+ main_image_encoder,
151
+ ):
152
+ super().__init__()
153
+ self.main_image_encoder = build_image_encoder(main_image_encoder)
154
+
155
+ def forward(self, image, mask=None):
156
+ outputs = {
157
+ 'main': self.main_image_encoder(image, mask=mask),
158
+ }
159
+ return outputs
160
+
161
+ def unconditional_embedding(self, batch_size):
162
+ outputs = {
163
+ 'main': self.main_image_encoder.unconditional_embedding(batch_size),
164
+ }
165
+ return outputs
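A config sketch for build_image_encoder. The Hub id below is an assumption for illustration only (any DINOv2-style checkpoint with patch size 14 fits the image_size // 14 patch arithmetic above):

config = {
    'type': 'DinoImageEncoder',
    'kwargs': {'version': 'facebook/dinov2-giant', 'use_cls_token': True, 'image_size': 518},
}
encoder = build_image_encoder(config)
# forward expects images in value_range (default [-1, 1]) and returns last_hidden_state
# of shape (batch, num_patches [+1 with the cls token], hidden_size)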
hy3dgen/shapegen/models/hunyuan3ddit.py ADDED
@@ -0,0 +1,390 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the repsective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import math
26
+ from dataclasses import dataclass
27
+ from typing import List, Tuple, Optional
28
+
29
+ import torch
30
+ from einops import rearrange
31
+ from torch import Tensor, nn
32
+
33
+
34
+ def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor:
35
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
36
+ x = rearrange(x, "B H L D -> B L (H D)")
37
+ return x
38
+
39
+
40
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
41
+ """
42
+ Create sinusoidal timestep embeddings.
43
+ :param t: a 1-D Tensor of N indices, one per batch element.
44
+ These may be fractional.
45
+ :param dim: the dimension of the output.
46
+ :param max_period: controls the minimum frequency of the embeddings.
47
+ :return: an (N, D) Tensor of positional embeddings.
48
+ """
49
+ t = time_factor * t
50
+ half = dim // 2
51
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
52
+ t.device
53
+ )
54
+
55
+ args = t[:, None].float() * freqs[None]
56
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
57
+ if dim % 2:
58
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
59
+ if torch.is_floating_point(t):
60
+ embedding = embedding.to(t)
61
+ return embedding
62
+
63
+
64
+ class MLPEmbedder(nn.Module):
65
+ def __init__(self, in_dim: int, hidden_dim: int):
66
+ super().__init__()
67
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
68
+ self.silu = nn.SiLU()
69
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
70
+
71
+ def forward(self, x: Tensor) -> Tensor:
72
+ return self.out_layer(self.silu(self.in_layer(x)))
73
+
74
+
75
+ class RMSNorm(torch.nn.Module):
76
+ def __init__(self, dim: int):
77
+ super().__init__()
78
+ self.scale = nn.Parameter(torch.ones(dim))
79
+
80
+ def forward(self, x: Tensor):
81
+ x_dtype = x.dtype
82
+ x = x.float()
83
+ rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6)
84
+ return (x * rrms).to(dtype=x_dtype) * self.scale
85
+
86
+
87
+ class QKNorm(torch.nn.Module):
88
+ def __init__(self, dim: int):
89
+ super().__init__()
90
+ self.query_norm = RMSNorm(dim)
91
+ self.key_norm = RMSNorm(dim)
92
+
93
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]:
94
+ q = self.query_norm(q)
95
+ k = self.key_norm(k)
96
+ return q.to(v), k.to(v)
97
+
98
+
99
+ class SelfAttention(nn.Module):
100
+ def __init__(
101
+ self,
102
+ dim: int,
103
+ num_heads: int = 8,
104
+ qkv_bias: bool = False,
105
+ ):
106
+ super().__init__()
107
+ self.num_heads = num_heads
108
+ head_dim = dim // num_heads
109
+
110
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
111
+ self.norm = QKNorm(head_dim)
112
+ self.proj = nn.Linear(dim, dim)
113
+
114
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
115
+ qkv = self.qkv(x)
116
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
117
+ q, k = self.norm(q, k, v)
118
+ x = attention(q, k, v, pe=pe)
119
+ x = self.proj(x)
120
+ return x
121
+
122
+
123
+ @dataclass
124
+ class ModulationOut:
125
+ shift: Tensor
126
+ scale: Tensor
127
+ gate: Tensor
128
+
129
+
130
+ class Modulation(nn.Module):
131
+ def __init__(self, dim: int, double: bool):
132
+ super().__init__()
133
+ self.is_double = double
134
+ self.multiplier = 6 if double else 3
135
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
136
+
137
+ def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]:
138
+ out = self.lin(nn.functional.silu(vec))[:, None, :]
139
+ out = out.chunk(self.multiplier, dim=-1)
140
+
141
+ return (
142
+ ModulationOut(*out[:3]),
143
+ ModulationOut(*out[3:]) if self.is_double else None,
144
+ )
145
+
146
+
147
+ class DoubleStreamBlock(nn.Module):
148
+ def __init__(
149
+ self,
150
+ hidden_size: int,
151
+ num_heads: int,
152
+ mlp_ratio: float,
153
+ qkv_bias: bool = False,
154
+ ):
155
+ super().__init__()
156
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
157
+ self.num_heads = num_heads
158
+ self.hidden_size = hidden_size
159
+ self.img_mod = Modulation(hidden_size, double=True)
160
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
161
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
162
+
163
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
164
+ self.img_mlp = nn.Sequential(
165
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
166
+ nn.GELU(approximate="tanh"),
167
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
168
+ )
169
+
170
+ self.txt_mod = Modulation(hidden_size, double=True)
171
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
172
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
173
+
174
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
175
+ self.txt_mlp = nn.Sequential(
176
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
177
+ nn.GELU(approximate="tanh"),
178
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
179
+ )
180
+
181
+ def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]:
182
+ img_mod1, img_mod2 = self.img_mod(vec)
183
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
184
+
185
+ img_modulated = self.img_norm1(img)
186
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
187
+ img_qkv = self.img_attn.qkv(img_modulated)
188
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
189
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
190
+
191
+ txt_modulated = self.txt_norm1(txt)
192
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
193
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
194
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
195
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
196
+
197
+ q = torch.cat((txt_q, img_q), dim=2)
198
+ k = torch.cat((txt_k, img_k), dim=2)
199
+ v = torch.cat((txt_v, img_v), dim=2)
200
+
201
+ attn = attention(q, k, v, pe=pe)
202
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
203
+
204
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
205
+ img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
206
+
207
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
208
+ txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
209
+ return img, txt
210
+
211
+
212
+ class SingleStreamBlock(nn.Module):
213
+ """
214
+ A DiT block with parallel linear layers as described in
215
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
216
+ """
217
+
218
+ def __init__(
219
+ self,
220
+ hidden_size: int,
221
+ num_heads: int,
222
+ mlp_ratio: float = 4.0,
223
+ qk_scale: Optional[float] = None,
224
+ ):
225
+ super().__init__()
226
+
227
+ self.hidden_dim = hidden_size
228
+ self.num_heads = num_heads
229
+ head_dim = hidden_size // num_heads
230
+ self.scale = qk_scale or head_dim ** -0.5
231
+
232
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
233
+ # qkv and mlp_in
234
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
235
+ # proj and mlp_out
236
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
237
+
238
+ self.norm = QKNorm(head_dim)
239
+
240
+ self.hidden_size = hidden_size
241
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
242
+
243
+ self.mlp_act = nn.GELU(approximate="tanh")
244
+ self.modulation = Modulation(hidden_size, double=False)
245
+
246
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
247
+ mod, _ = self.modulation(vec)
248
+
249
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
250
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
251
+
252
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
253
+ q, k = self.norm(q, k, v)
254
+
255
+ # compute attention
256
+ attn = attention(q, k, v, pe=pe)
257
+ # compute activation in mlp stream, cat again and run second linear layer
258
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
259
+ return x + mod.gate * output
260
+
261
+
262
+ class LastLayer(nn.Module):
263
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
264
+ super().__init__()
265
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
266
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
267
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
268
+
269
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
270
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
271
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
272
+ x = self.linear(x)
273
+ return x
274
+
275
+
276
+ class Hunyuan3DDiT(nn.Module):
277
+ def __init__(
278
+ self,
279
+ in_channels: int = 64,
280
+ context_in_dim: int = 1536,
281
+ hidden_size: int = 1024,
282
+ mlp_ratio: float = 4.0,
283
+ num_heads: int = 16,
284
+ depth: int = 16,
285
+ depth_single_blocks: int = 32,
286
+ axes_dim: List[int] = [64],
287
+ theta: int = 10_000,
288
+ qkv_bias: bool = True,
289
+ time_factor: float = 1000,
290
+ ckpt_path: Optional[str] = None,
291
+ **kwargs,
292
+ ):
293
+ super().__init__()
294
+ self.in_channels = in_channels
295
+ self.context_in_dim = context_in_dim
296
+ self.hidden_size = hidden_size
297
+ self.mlp_ratio = mlp_ratio
298
+ self.num_heads = num_heads
299
+ self.depth = depth
300
+ self.depth_single_blocks = depth_single_blocks
301
+ self.axes_dim = axes_dim
302
+ self.theta = theta
303
+ self.qkv_bias = qkv_bias
304
+ self.time_factor = time_factor
305
+ self.out_channels = self.in_channels
306
+
307
+ if hidden_size % num_heads != 0:
308
+ raise ValueError(
309
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
310
+ )
311
+ pe_dim = hidden_size // num_heads
312
+ if sum(axes_dim) != pe_dim:
313
+ raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}")
314
+ self.hidden_size = hidden_size
315
+ self.num_heads = num_heads
316
+ self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
317
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
318
+ self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
319
+
320
+ self.double_blocks = nn.ModuleList(
321
+ [
322
+ DoubleStreamBlock(
323
+ self.hidden_size,
324
+ self.num_heads,
325
+ mlp_ratio=mlp_ratio,
326
+ qkv_bias=qkv_bias,
327
+ )
328
+ for _ in range(depth)
329
+ ]
330
+ )
331
+
332
+ self.single_blocks = nn.ModuleList(
333
+ [
334
+ SingleStreamBlock(
335
+ self.hidden_size,
336
+ self.num_heads,
337
+ mlp_ratio=mlp_ratio,
338
+ )
339
+ for _ in range(depth_single_blocks)
340
+ ]
341
+ )
342
+
343
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
344
+
345
+ if ckpt_path is not None:
346
+ print('restored denoiser ckpt', ckpt_path)
347
+
348
+ ckpt = torch.load(ckpt_path, map_location="cpu")
349
+ if 'state_dict' not in ckpt:
350
+ # deepspeed ckpt
351
+ state_dict = {}
352
+ for k in ckpt.keys():
353
+ new_k = k.replace('_forward_module.', '')
354
+ state_dict[new_k] = ckpt[k]
355
+ else:
356
+ state_dict = ckpt["state_dict"]
357
+
358
+ final_state_dict = {}
359
+ for k, v in state_dict.items():
360
+ if k.startswith('model.'):
361
+ final_state_dict[k.replace('model.', '')] = v
362
+ else:
363
+ final_state_dict[k] = v
364
+ missing, unexpected = self.load_state_dict(final_state_dict, strict=False)
365
+ print('unexpected keys:', unexpected)
366
+ print('missing keys:', missing)
367
+
368
+ def forward(
369
+ self,
370
+ x,
371
+ t,
372
+ contexts,
373
+ **kwargs,
374
+ ) -> Tensor:
375
+ cond = contexts['main']
376
+ latent = self.latent_in(x)
377
+ vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
378
+ cond = self.cond_in(cond)
379
+ pe = None
380
+
381
+ for block in self.double_blocks:
382
+ latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe)
383
+
384
+ latent = torch.cat((cond, latent), 1)
385
+ for block in self.single_blocks:
386
+ latent = block(latent, vec=vec, pe=pe)
387
+
388
+ latent = latent[:, cond.shape[1]:, ...]
389
+ latent = self.final_layer(latent, vec)
390
+ return latent
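A shape sketch exercising the forward signature with random tensors; all sizes are assumptions derived from the defaults above (in_channels=64, hidden_size=1024, context_in_dim=1536), and the token counts are arbitrary:

import torch

model = Hunyuan3DDiT()
x = torch.randn(2, 512, 64)                      # (batch, latent tokens, in_channels)
t = torch.rand(2)                                # one timestep per batch element
contexts = {'main': torch.randn(2, 257, 1536)}   # e.g. a DINO cls token plus 16x16 patch tokens
out = model(x, t, contexts)                      # same shape as x: (2, 512, 64)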
hy3dgen/shapegen/models/vae.py ADDED
@@ -0,0 +1,636 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ from typing import Tuple, List, Union, Optional
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ from einops import rearrange, repeat
32
+ from skimage import measure
33
+ from tqdm import tqdm
34
+
35
+
36
+ class FourierEmbedder(nn.Module):
37
+ """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
38
+ each feature dimension of `x[..., i]` into:
39
+ [
40
+ sin(x[..., i]),
41
+ sin(f_1*x[..., i]),
42
+ sin(f_2*x[..., i]),
43
+ ...
44
+ sin(f_N * x[..., i]),
45
+ cos(x[..., i]),
46
+ cos(f_1*x[..., i]),
47
+ cos(f_2*x[..., i]),
48
+ ...
49
+ cos(f_N * x[..., i]),
50
+ x[..., i] # only present if include_input is True.
51
+ ], here f_i is the frequency.
52
+
53
+ The frequency index i runs over [0, 1, 2, ..., num_freqs - 1].
54
+ If logspace is True, the frequencies are f_i = 2^i (i.e. 2.0 ** torch.arange(num_freqs));
55
+ otherwise, the frequencies are linearly spaced between 1.0 and 2^(num_freqs - 1).
56
+
57
+ Args:
58
+ num_freqs (int): the number of frequencies, default is 6;
59
+ logspace (bool): If logspace is True, the frequencies are f_i = 2^i,
60
+ otherwise they are linearly spaced between 1.0 and 2^(num_freqs - 1);
61
+ input_dim (int): the input dimension, default is 3;
62
+ include_input (bool): include the input tensor or not, default is True.
63
+
64
+ Attributes:
65
+ frequencies (torch.Tensor): the frequency tensor; f_i = 2^i if logspace is True,
66
+ otherwise linearly spaced between 1.0 and 2^(num_freqs - 1);
67
+
68
+ out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
69
+ otherwise, it is input_dim * num_freqs * 2.
70
+
71
+ """
72
+
73
+ def __init__(self,
74
+ num_freqs: int = 6,
75
+ logspace: bool = True,
76
+ input_dim: int = 3,
77
+ include_input: bool = True,
78
+ include_pi: bool = True) -> None:
79
+
80
+ """The initialization"""
81
+
82
+ super().__init__()
83
+
84
+ if logspace:
85
+ frequencies = 2.0 ** torch.arange(
86
+ num_freqs,
87
+ dtype=torch.float32
88
+ )
89
+ else:
90
+ frequencies = torch.linspace(
91
+ 1.0,
92
+ 2.0 ** (num_freqs - 1),
93
+ num_freqs,
94
+ dtype=torch.float32
95
+ )
96
+
97
+ if include_pi:
98
+ frequencies *= torch.pi
99
+
100
+ self.register_buffer("frequencies", frequencies, persistent=False)
101
+ self.include_input = include_input
102
+ self.num_freqs = num_freqs
103
+
104
+ self.out_dim = self.get_dims(input_dim)
105
+
106
+ def get_dims(self, input_dim):
107
+ temp = 1 if self.include_input or self.num_freqs == 0 else 0
108
+ out_dim = input_dim * (self.num_freqs * 2 + temp)
109
+
110
+ return out_dim
111
+
112
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
113
+ """ Forward process.
114
+
115
+ Args:
116
+ x: tensor of shape [..., dim]
117
+
118
+ Returns:
119
+ embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
120
+ where temp is 1 if include_input is True and 0 otherwise.
121
+ """
122
+
123
+ if self.num_freqs > 0:
124
+ embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1)
125
+ if self.include_input:
126
+ return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
127
+ else:
128
+ return torch.cat((embed.sin(), embed.cos()), dim=-1)
129
+ else:
130
+ return x
131
+
132
+
133
+ class DropPath(nn.Module):
134
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
135
+ """
136
+
137
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
138
+ super(DropPath, self).__init__()
139
+ self.drop_prob = drop_prob
140
+ self.scale_by_keep = scale_by_keep
141
+
142
+ def forward(self, x):
143
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
144
+
145
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
146
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
147
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
148
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
149
+ 'survival rate' as the argument.
150
+
151
+ """
152
+ if self.drop_prob == 0. or not self.training:
153
+ return x
154
+ keep_prob = 1 - self.drop_prob
155
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
156
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
157
+ if keep_prob > 0.0 and self.scale_by_keep:
158
+ random_tensor.div_(keep_prob)
159
+ return x * random_tensor
160
+
161
+ def extra_repr(self):
162
+ return f'drop_prob={round(self.drop_prob, 3):0.3f}'
163
+
164
+
165
+ class MLP(nn.Module):
166
+ def __init__(
167
+ self, *,
168
+ width: int,
169
+ output_width: int = None,
170
+ drop_path_rate: float = 0.0
171
+ ):
172
+ super().__init__()
173
+ self.width = width
174
+ self.c_fc = nn.Linear(width, width * 4)
175
+ self.c_proj = nn.Linear(width * 4, output_width if output_width is not None else width)
176
+ self.gelu = nn.GELU()
177
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
178
+
179
+ def forward(self, x):
180
+ return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
181
+
182
+
183
+ class QKVMultiheadCrossAttention(nn.Module):
184
+ def __init__(
185
+ self,
186
+ *,
187
+ heads: int,
188
+ n_data: Optional[int] = None,
189
+ width=None,
190
+ qk_norm=False,
191
+ norm_layer=nn.LayerNorm
192
+ ):
193
+ super().__init__()
194
+ self.heads = heads
195
+ self.n_data = n_data
196
+ self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
197
+ self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
198
+
199
+ def forward(self, q, kv):
200
+ _, n_ctx, _ = q.shape
201
+ bs, n_data, width = kv.shape
202
+ attn_ch = width // self.heads // 2
203
+ q = q.view(bs, n_ctx, self.heads, -1)
204
+ kv = kv.view(bs, n_data, self.heads, -1)
205
+ k, v = torch.split(kv, attn_ch, dim=-1)
206
+
207
+ q = self.q_norm(q)
208
+ k = self.k_norm(k)
209
+
210
+ q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
211
+ out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
212
+
213
+ return out
214
+
215
+
216
+ class MultiheadCrossAttention(nn.Module):
217
+ def __init__(
218
+ self,
219
+ *,
220
+ width: int,
221
+ heads: int,
222
+ qkv_bias: bool = True,
223
+ n_data: Optional[int] = None,
224
+ data_width: Optional[int] = None,
225
+ norm_layer=nn.LayerNorm,
226
+ qk_norm: bool = False
227
+ ):
228
+ super().__init__()
229
+ self.n_data = n_data
230
+ self.width = width
231
+ self.heads = heads
232
+ self.data_width = width if data_width is None else data_width
233
+ self.c_q = nn.Linear(width, width, bias=qkv_bias)
234
+ self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
235
+ self.c_proj = nn.Linear(width, width)
236
+ self.attention = QKVMultiheadCrossAttention(
237
+ heads=heads,
238
+ n_data=n_data,
239
+ width=width,
240
+ norm_layer=norm_layer,
241
+ qk_norm=qk_norm
242
+ )
243
+
244
+ def forward(self, x, data):
245
+ x = self.c_q(x)
246
+ data = self.c_kv(data)
247
+ x = self.attention(x, data)
248
+ x = self.c_proj(x)
249
+ return x
250
+
251
+
252
+ class ResidualCrossAttentionBlock(nn.Module):
253
+ def __init__(
254
+ self,
255
+ *,
256
+ n_data: Optional[int] = None,
257
+ width: int,
258
+ heads: int,
259
+ data_width: Optional[int] = None,
260
+ qkv_bias: bool = True,
261
+ norm_layer=nn.LayerNorm,
262
+ qk_norm: bool = False
263
+ ):
264
+ super().__init__()
265
+
266
+ if data_width is None:
267
+ data_width = width
268
+
269
+ self.attn = MultiheadCrossAttention(
270
+ n_data=n_data,
271
+ width=width,
272
+ heads=heads,
273
+ data_width=data_width,
274
+ qkv_bias=qkv_bias,
275
+ norm_layer=norm_layer,
276
+ qk_norm=qk_norm
277
+ )
278
+ self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
279
+ self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
280
+ self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
281
+ self.mlp = MLP(width=width)
282
+
283
+ def forward(self, x: torch.Tensor, data: torch.Tensor):
284
+ x = x + self.attn(self.ln_1(x), self.ln_2(data))
285
+ x = x + self.mlp(self.ln_3(x))
286
+ return x
287
+
288
+
289
+ class QKVMultiheadAttention(nn.Module):
290
+ def __init__(
291
+ self,
292
+ *,
293
+ heads: int,
294
+ n_ctx: int,
295
+ width=None,
296
+ qk_norm=False,
297
+ norm_layer=nn.LayerNorm
298
+ ):
299
+ super().__init__()
300
+ self.heads = heads
301
+ self.n_ctx = n_ctx
302
+ self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
303
+ self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
304
+
305
+ def forward(self, qkv):
306
+ bs, n_ctx, width = qkv.shape
307
+ attn_ch = width // self.heads // 3
308
+ qkv = qkv.view(bs, n_ctx, self.heads, -1)
309
+ q, k, v = torch.split(qkv, attn_ch, dim=-1)
310
+
311
+ q = self.q_norm(q)
312
+ k = self.k_norm(k)
313
+
314
+ q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
315
+ out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
316
+ return out
317
+
318
+
319
+ class MultiheadAttention(nn.Module):
320
+ def __init__(
321
+ self,
322
+ *,
323
+ n_ctx: int,
324
+ width: int,
325
+ heads: int,
326
+ qkv_bias: bool,
327
+ norm_layer=nn.LayerNorm,
328
+ qk_norm: bool = False,
329
+ drop_path_rate: float = 0.0
330
+ ):
331
+ super().__init__()
332
+ self.n_ctx = n_ctx
333
+ self.width = width
334
+ self.heads = heads
335
+ self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
336
+ self.c_proj = nn.Linear(width, width)
337
+ self.attention = QKVMultiheadAttention(
338
+ heads=heads,
339
+ n_ctx=n_ctx,
340
+ width=width,
341
+ norm_layer=norm_layer,
342
+ qk_norm=qk_norm
343
+ )
344
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
345
+
346
+ def forward(self, x):
347
+ x = self.c_qkv(x)
348
+ x = self.attention(x)
349
+ x = self.drop_path(self.c_proj(x))
350
+ return x
351
+
352
+
353
+ class ResidualAttentionBlock(nn.Module):
354
+ def __init__(
355
+ self,
356
+ *,
357
+ n_ctx: int,
358
+ width: int,
359
+ heads: int,
360
+ qkv_bias: bool = True,
361
+ norm_layer=nn.LayerNorm,
362
+ qk_norm: bool = False,
363
+ drop_path_rate: float = 0.0,
364
+ ):
365
+ super().__init__()
366
+ self.attn = MultiheadAttention(
367
+ n_ctx=n_ctx,
368
+ width=width,
369
+ heads=heads,
370
+ qkv_bias=qkv_bias,
371
+ norm_layer=norm_layer,
372
+ qk_norm=qk_norm,
373
+ drop_path_rate=drop_path_rate
374
+ )
375
+ self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
376
+ self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
377
+ self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
378
+
379
+ def forward(self, x: torch.Tensor):
380
+ x = x + self.attn(self.ln_1(x))
381
+ x = x + self.mlp(self.ln_2(x))
382
+ return x
383
+
384
+
385
+ class Transformer(nn.Module):
386
+ def __init__(
387
+ self,
388
+ *,
389
+ n_ctx: int,
390
+ width: int,
391
+ layers: int,
392
+ heads: int,
393
+ qkv_bias: bool = True,
394
+ norm_layer=nn.LayerNorm,
395
+ qk_norm: bool = False,
396
+ drop_path_rate: float = 0.0
397
+ ):
398
+ super().__init__()
399
+ self.n_ctx = n_ctx
400
+ self.width = width
401
+ self.layers = layers
402
+ self.resblocks = nn.ModuleList(
403
+ [
404
+ ResidualAttentionBlock(
405
+ n_ctx=n_ctx,
406
+ width=width,
407
+ heads=heads,
408
+ qkv_bias=qkv_bias,
409
+ norm_layer=norm_layer,
410
+ qk_norm=qk_norm,
411
+ drop_path_rate=drop_path_rate
412
+ )
413
+ for _ in range(layers)
414
+ ]
415
+ )
416
+
417
+ def forward(self, x: torch.Tensor):
418
+ for block in self.resblocks:
419
+ x = block(x)
420
+ return x
421
+
422
+
423
+ class CrossAttentionDecoder(nn.Module):
424
+
425
+ def __init__(
426
+ self,
427
+ *,
428
+ num_latents: int,
429
+ out_channels: int,
430
+ fourier_embedder: FourierEmbedder,
431
+ width: int,
432
+ heads: int,
433
+ qkv_bias: bool = True,
434
+ qk_norm: bool = False,
435
+ label_type: str = "binary"
436
+ ):
437
+ super().__init__()
438
+
439
+ self.fourier_embedder = fourier_embedder
440
+
441
+ self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width)
442
+
443
+ self.cross_attn_decoder = ResidualCrossAttentionBlock(
444
+ n_data=num_latents,
445
+ width=width,
446
+ heads=heads,
447
+ qkv_bias=qkv_bias,
448
+ qk_norm=qk_norm
449
+ )
450
+
451
+ self.ln_post = nn.LayerNorm(width)
452
+ self.output_proj = nn.Linear(width, out_channels)
453
+ self.label_type = label_type
454
+
455
+ def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
456
+ queries = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
457
+ x = self.cross_attn_decoder(queries, latents)
458
+ x = self.ln_post(x)
459
+ occ = self.output_proj(x)
460
+ return occ
461
+
462
+
463
+ def generate_dense_grid_points(bbox_min: np.ndarray,
464
+ bbox_max: np.ndarray,
465
+ octree_depth: int,
466
+ indexing: str = "ij",
467
+ octree_resolution: int = None,
468
+ ):
469
+ length = bbox_max - bbox_min
470
+ num_cells = np.exp2(octree_depth)
471
+ if octree_resolution is not None:
472
+ num_cells = octree_resolution
473
+
474
+ x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
475
+ y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
476
+ z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
477
+ [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
478
+ xyz = np.stack((xs, ys, zs), axis=-1)
479
+ xyz = xyz.reshape(-1, 3)
480
+ grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
481
+
482
+ return xyz, grid_size, length
483
+
484
+
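A quick worked example of the grid helper above (a minimal sketch; it assumes the package is importable as `hy3dgen.shapegen.models.vae`): each axis gets `num_cells + 1` samples, so `octree_resolution=256` produces a 257×257×257 query grid.

```python
import numpy as np
from hy3dgen.shapegen.models.vae import generate_dense_grid_points

bbox_min = np.array([-1.01, -1.01, -1.01])
bbox_max = np.array([1.01, 1.01, 1.01])

# octree_resolution overrides octree_depth when both are given
xyz, grid_size, length = generate_dense_grid_points(
    bbox_min, bbox_max, octree_depth=7, octree_resolution=256
)
print(xyz.shape)   # (257 ** 3, 3) flattened query points
print(grid_size)   # [257, 257, 257]
print(length)      # per-axis extent of the box, here [2.02, 2.02, 2.02]
```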
485
+ def center_vertices(vertices):
486
+ """Translate the vertices so that bounding box is centered at zero."""
487
+ vert_min = vertices.min(dim=0)[0]
488
+ vert_max = vertices.max(dim=0)[0]
489
+ vert_center = 0.5 * (vert_min + vert_max)
490
+ return vertices - vert_center
491
+
492
+
493
+ class Latent2MeshOutput:
494
+
495
+ def __init__(self, mesh_v=None, mesh_f=None):
496
+ self.mesh_v = mesh_v
497
+ self.mesh_f = mesh_f
498
+
499
+
500
+ class ShapeVAE(nn.Module):
501
+ def __init__(
502
+ self,
503
+ *,
504
+ num_latents: int,
505
+ embed_dim: int,
506
+ width: int,
507
+ heads: int,
508
+ num_decoder_layers: int,
509
+ num_freqs: int = 8,
510
+ include_pi: bool = True,
511
+ qkv_bias: bool = True,
512
+ qk_norm: bool = False,
513
+ label_type: str = "binary",
514
+ drop_path_rate: float = 0.0,
515
+ scale_factor: float = 1.0,
516
+ ):
517
+ super().__init__()
518
+ self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
519
+
520
+ self.post_kl = nn.Linear(embed_dim, width)
521
+
522
+ self.transformer = Transformer(
523
+ n_ctx=num_latents,
524
+ width=width,
525
+ layers=num_decoder_layers,
526
+ heads=heads,
527
+ qkv_bias=qkv_bias,
528
+ qk_norm=qk_norm,
529
+ drop_path_rate=drop_path_rate
530
+ )
531
+
532
+ self.geo_decoder = CrossAttentionDecoder(
533
+ fourier_embedder=self.fourier_embedder,
534
+ out_channels=1,
535
+ num_latents=num_latents,
536
+ width=width,
537
+ heads=heads,
538
+ qkv_bias=qkv_bias,
539
+ qk_norm=qk_norm,
540
+ label_type=label_type,
541
+ )
542
+
543
+ self.scale_factor = scale_factor
544
+ self.latent_shape = (num_latents, embed_dim)
545
+
546
+ def forward(self, latents):
547
+ latents = self.post_kl(latents)
548
+ latents = self.transformer(latents)
549
+ return latents
550
+
551
+ @torch.no_grad()
552
+ def latents2mesh(
553
+ self,
554
+ latents: torch.FloatTensor,
555
+ bounds: Union[Tuple[float], List[float], float] = 1.1,
556
+ octree_depth: int = 7,
557
+ num_chunks: int = 10000,
558
+ mc_level: float = -1 / 512,
559
+ octree_resolution: int = None,
560
+ mc_algo: str = 'dmc',
561
+ ):
562
+ device = latents.device
563
+
564
+ # 1. generate query points
565
+ if isinstance(bounds, float):
566
+ bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
567
+ bbox_min = np.array(bounds[0:3])
568
+ bbox_max = np.array(bounds[3:6])
569
+ bbox_size = bbox_max - bbox_min
570
+ xyz_samples, grid_size, length = generate_dense_grid_points(
571
+ bbox_min=bbox_min,
572
+ bbox_max=bbox_max,
573
+ octree_depth=octree_depth,
574
+ octree_resolution=octree_resolution,
575
+ indexing="ij"
576
+ )
577
+ xyz_samples = torch.FloatTensor(xyz_samples)
578
+
579
+ # 2. latents to 3d volume
580
+ batch_logits = []
581
+ batch_size = latents.shape[0]
582
+ for start in tqdm(range(0, xyz_samples.shape[0], num_chunks),
583
+ desc=f"MC Level {mc_level} Implicit Function:"):
584
+ queries = xyz_samples[start: start + num_chunks, :].to(device)
585
+ queries = queries.half()
586
+ batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
587
+
588
+ logits = self.geo_decoder(batch_queries.to(latents.dtype), latents)
588
+ if mc_level == -1:
589
+ # trained with soft labels: squash every chunk's logits with sigmoid (not only the first chunk)
+ logits = torch.sigmoid(logits) * 2 - 1
590
+ batch_logits.append(logits)
591
+ if mc_level == -1:
592
+ # once all chunks are mapped through the sigmoid, run marching cubes at level 0
+ print('Training with soft labels, inference with sigmoid and marching cubes level 0.')
+ mc_level = 0
593
+ grid_logits = torch.cat(batch_logits, dim=1)
595
+ grid_logits = grid_logits.view((batch_size, grid_size[0], grid_size[1], grid_size[2])).float()
596
+
597
+ # 3. extract surface
598
+ outputs = []
599
+ for i in range(batch_size):
600
+ try:
601
+ if mc_algo == 'mc':
602
+ vertices, faces, normals, _ = measure.marching_cubes(
603
+ grid_logits[i].cpu().numpy(),
604
+ mc_level,
605
+ method="lewiner"
606
+ )
607
+ vertices = vertices / grid_size * bbox_size + bbox_min
608
+ elif mc_algo == 'dmc':
609
+ if not hasattr(self, 'dmc'):
610
+ try:
611
+ from diso import DiffDMC
612
+ except ImportError:
613
+ raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
614
+ self.dmc = DiffDMC(dtype=torch.float32).to(device)
615
+ octree_resolution = 2 ** octree_depth if octree_resolution is None else octree_resolution
616
+ sdf = -grid_logits[i] / octree_resolution
617
+ verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
618
+ verts = center_vertices(verts)
619
+ vertices = verts.detach().cpu().numpy()
620
+ faces = faces.detach().cpu().numpy()[:, ::-1]
621
+ else:
622
+ raise ValueError(f"mc_algo {mc_algo} not supported.")
623
+
624
+ outputs.append(
625
+ Latent2MeshOutput(
626
+ mesh_v=vertices.astype(np.float32),
627
+ mesh_f=np.ascontiguousarray(faces)
628
+ )
629
+ )
630
+
631
+ except ValueError:
632
+ outputs.append(None)
633
+ except RuntimeError:
634
+ outputs.append(None)
635
+
636
+ return outputs
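For orientation, a hedged sketch of how the decoder above is driven, mirroring the pipeline's export step; the hyper-parameters and the random latents are placeholders, not values from this checkpoint:

```python
import torch
from hy3dgen.shapegen.models.vae import ShapeVAE

# hypothetical hyper-parameters, chosen only to make the sketch self-contained;
# the real values come from the checkpoint's config.yaml
vae = ShapeVAE(
    num_latents=256, embed_dim=64, width=768,
    heads=12, num_decoder_layers=4,
).eval()

# stand-in for diffusion output: (batch, num_latents, embed_dim)
latents = torch.randn(1, 256, 64)

latents = latents / vae.scale_factor       # undo the training-time latent scaling
latents = vae(latents)                     # post_kl projection + decoder transformer
meshes = vae.latents2mesh(latents, octree_resolution=64, mc_algo='mc')
if meshes[0] is not None:                  # marching cubes can fail on untrained weights
    print(meshes[0].mesh_v.shape, meshes[0].mesh_f.shape)
```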
hy3dgen/shapegen/pipelines.py ADDED
@@ -0,0 +1,589 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import copy
26
+ import importlib
27
+ import inspect
28
+ import logging
29
+ import os
30
+ from typing import List, Optional, Union
31
+
32
+ import numpy as np
33
+ import torch
34
+ import trimesh
35
+ import yaml
36
+ from PIL import Image
37
+ from diffusers.utils.torch_utils import randn_tensor
38
+ from tqdm import tqdm
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ def retrieve_timesteps(
44
+ scheduler,
45
+ num_inference_steps: Optional[int] = None,
46
+ device: Optional[Union[str, torch.device]] = None,
47
+ timesteps: Optional[List[int]] = None,
48
+ sigmas: Optional[List[float]] = None,
49
+ **kwargs,
50
+ ):
51
+ """
52
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
53
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
54
+
55
+ Args:
56
+ scheduler (`SchedulerMixin`):
57
+ The scheduler to get timesteps from.
58
+ num_inference_steps (`int`):
59
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
60
+ must be `None`.
61
+ device (`str` or `torch.device`, *optional*):
62
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
63
+ timesteps (`List[int]`, *optional*):
64
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
65
+ `num_inference_steps` and `sigmas` must be `None`.
66
+ sigmas (`List[float]`, *optional*):
67
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
68
+ `num_inference_steps` and `timesteps` must be `None`.
69
+
70
+ Returns:
71
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
72
+ second element is the number of inference steps.
73
+ """
74
+ if timesteps is not None and sigmas is not None:
75
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
76
+ if timesteps is not None:
77
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
78
+ if not accepts_timesteps:
79
+ raise ValueError(
80
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
81
+ f" timestep schedules. Please check whether you are using the correct scheduler."
82
+ )
83
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
84
+ timesteps = scheduler.timesteps
85
+ num_inference_steps = len(timesteps)
86
+ elif sigmas is not None:
87
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
88
+ if not accept_sigmas:
89
+ raise ValueError(
90
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
91
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
92
+ )
93
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
94
+ timesteps = scheduler.timesteps
95
+ num_inference_steps = len(timesteps)
96
+ else:
97
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
98
+ timesteps = scheduler.timesteps
99
+ return timesteps, num_inference_steps
100
+
101
+
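A small illustration of the helper above (assuming `diffusers` is installed, which the imports in this file already require): with a stock scheduler and no custom schedule it simply forwards to `set_timesteps` and reports the schedule length.

```python
from diffusers import DDIMScheduler
from hy3dgen.shapegen.pipelines import retrieve_timesteps

scheduler = DDIMScheduler(num_train_timesteps=1000)
timesteps, n = retrieve_timesteps(scheduler, num_inference_steps=10, device='cpu')
print(n)          # 10
print(timesteps)  # the scheduler's own spacing, e.g. tensor([900, 800, ..., 0])
```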
102
+ def export_to_trimesh(mesh_output):
103
+ if isinstance(mesh_output, list):
104
+ outputs = []
105
+ for mesh in mesh_output:
106
+ if mesh is None:
107
+ outputs.append(None)
108
+ else:
109
+ mesh.mesh_f = mesh.mesh_f[:, ::-1]
110
+ mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f)
111
+ outputs.append(mesh_output)
112
+ return outputs
113
+ else:
114
+ mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1]
115
+ mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f)
116
+ return mesh_output
117
+
118
+
119
+ def get_obj_from_str(string, reload=False):
120
+ module, cls = string.rsplit(".", 1)
121
+ if reload:
122
+ module_imp = importlib.import_module(module)
123
+ importlib.reload(module_imp)
124
+ return getattr(importlib.import_module(module, package=None), cls)
125
+
126
+
127
+ def instantiate_from_config(config, **kwargs):
128
+ if "target" not in config:
129
+ raise KeyError("Expected key `target` to instantiate.")
130
+ cls = get_obj_from_str(config["target"])
131
+ params = config.get("params", dict())
132
+ kwargs.update(params)
133
+ instance = cls(**kwargs)
134
+ return instance
135
+
136
+
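The `target`/`params` convention consumed by `instantiate_from_config` is the same one `config.yaml` uses; a minimal self-contained example with a torch layer standing in for the real model classes:

```python
import torch.nn as nn
from hy3dgen.shapegen.pipelines import instantiate_from_config

config = {
    "target": "torch.nn.Linear",                      # dotted import path of the class
    "params": {"in_features": 4, "out_features": 8},  # constructor kwargs
}
layer = instantiate_from_config(config, bias=False)   # extra kwargs are merged in
assert isinstance(layer, nn.Linear) and layer.bias is None
```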
137
+ class Hunyuan3DDiTPipeline:
138
+ @classmethod
139
+ def from_single_file(
140
+ cls,
141
+ ckpt_path,
142
+ config_path,
143
+ device='cpu',
144
+ dtype=torch.float16,
145
+ **kwargs,
146
+ ):
147
+ # load config
148
+ with open(config_path, 'r') as f:
149
+ config = yaml.safe_load(f)
150
+
151
+ # load ckpt
152
+ if not os.path.exists(ckpt_path):
153
+ raise FileNotFoundError(f"Model file {ckpt_path} not found")
154
+ logger.info(f"Loading model from {ckpt_path}")
155
+
156
+ if ckpt_path.endswith('.safetensors'):
157
+ # parse safetensors
158
+ import safetensors.torch
159
+ safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
160
+ ckpt = {}
161
+ for key, value in safetensors_ckpt.items():
162
+ model_name = key.split('.')[0]
163
+ new_key = key[len(model_name) + 1:]
164
+ if model_name not in ckpt:
165
+ ckpt[model_name] = {}
166
+ ckpt[model_name][new_key] = value
167
+ else:
168
+ ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
169
+
170
+ # load model
171
+ from accelerate import init_empty_weights
172
+ with init_empty_weights():
173
+ model = instantiate_from_config(config['model'])
174
+ vae = instantiate_from_config(config['vae'])
175
+ conditioner = instantiate_from_config(config['conditioner'])
176
+ image_processor = instantiate_from_config(config['image_processor'])
177
+ scheduler = instantiate_from_config(config['scheduler'])
178
+
179
+ model.load_state_dict(ckpt['model'], assign=True)
180
+ vae.load_state_dict(ckpt['vae'], assign=True)
181
+ if 'conditioner' in ckpt:
182
+ conditioner.load_state_dict(ckpt['conditioner'], assign=True)
183
+
184
+ model_kwargs = dict(
185
+ vae=vae,
186
+ model=model,
187
+ scheduler=scheduler,
188
+ conditioner=conditioner,
189
+ image_processor=image_processor,
190
+ device=device,
191
+ dtype=dtype,
192
+ )
193
+ model_kwargs.update(kwargs)
194
+
195
+ return cls(
196
+ **model_kwargs
197
+ )
198
+
199
+ @classmethod
200
+ def from_pretrained(
201
+ cls,
202
+ model_path,
203
+ device='cuda',
204
+ dtype=torch.float16,
205
+ use_safetensors=None,
206
+ variant=None,
207
+ subfolder='hunyuan3d-dit-v2-0',
208
+ **kwargs,
209
+ ):
210
+ original_model_path = model_path
211
+ if not os.path.exists(model_path):
212
+ # try local path
213
+ base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
214
+ model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder))
215
+ if not os.path.exists(model_path):
216
+ try:
217
+ import huggingface_hub
218
+ # download from huggingface
219
+ path = huggingface_hub.snapshot_download(repo_id=original_model_path)
220
+ model_path = os.path.join(path, subfolder)
221
+ except ImportError:
222
+ logger.warning(
223
+ "You need to install HuggingFace Hub to load models from the hub."
224
+ )
225
+ raise RuntimeError(f"Model path {model_path} not found")
226
+ if not os.path.exists(model_path):
227
+ raise FileNotFoundError(f"Model path {original_model_path} not found")
228
+
229
+ extension = 'ckpt' if not use_safetensors else 'safetensors'
230
+ variant = '' if variant is None else f'.{variant}'
231
+ ckpt_name = f'model{variant}.{extension}'
232
+ config_path = os.path.join(model_path, 'config.yaml')
233
+ ckpt_path = os.path.join(model_path, ckpt_name)
234
+
235
+ return cls.from_single_file(
236
+ ckpt_path,
237
+ config_path,
238
+ device=device,
239
+ dtype=dtype,
240
+ use_safetensors=use_safetensors,
241
+ variant=variant,
242
+ **kwargs
243
+ )
244
+
245
+ def __init__(
246
+ self,
247
+ vae,
248
+ model,
249
+ scheduler,
250
+ conditioner,
251
+ image_processor,
252
+ device='cuda',
253
+ dtype=torch.float16,
254
+ **kwargs
255
+ ):
256
+ self.vae = vae
257
+ self.model = model
258
+ self.scheduler = scheduler
259
+ self.conditioner = conditioner
260
+ self.image_processor = image_processor
261
+
262
+ self.to(device, dtype)
263
+
264
+ def to(self, device=None, dtype=None):
265
+ if device is not None:
266
+ self.device = torch.device(device)
267
+ self.vae.to(device)
268
+ self.model.to(device)
269
+ self.conditioner.to(device)
270
+ if dtype is not None:
271
+ self.dtype = dtype
272
+ self.vae.to(dtype=dtype)
273
+ self.model.to(dtype=dtype)
274
+ self.conditioner.to(dtype=dtype)
275
+
276
+ def encode_cond(self, image, mask, do_classifier_free_guidance, dual_guidance):
277
+ bsz = image.shape[0]
278
+ cond = self.conditioner(image=image, mask=mask)
279
+
280
+ if do_classifier_free_guidance:
281
+ un_cond = self.conditioner.unconditional_embedding(bsz)
282
+
283
+ if dual_guidance:
284
+ un_cond_drop_main = copy.deepcopy(un_cond)
285
+ un_cond_drop_main['additional'] = cond['additional']
286
+
287
+ def cat_recursive(a, b, c):
288
+ if isinstance(a, torch.Tensor):
289
+ return torch.cat([a, b, c], dim=0).to(self.dtype)
290
+ out = {}
291
+ for k in a.keys():
292
+ out[k] = cat_recursive(a[k], b[k], c[k])
293
+ return out
294
+
295
+ cond = cat_recursive(cond, un_cond_drop_main, un_cond)
296
+ else:
297
+ un_cond = self.conditioner.unconditional_embedding(bsz)
298
+
299
+ def cat_recursive(a, b):
300
+ if isinstance(a, torch.Tensor):
301
+ return torch.cat([a, b], dim=0).to(self.dtype)
302
+ out = {}
303
+ for k in a.keys():
304
+ out[k] = cat_recursive(a[k], b[k])
305
+ return out
306
+
307
+ cond = cat_recursive(cond, un_cond)
308
+ return cond
309
+
310
+ def prepare_extra_step_kwargs(self, generator, eta):
311
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
312
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
313
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
314
+ # and should be between [0, 1]
315
+
316
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
317
+ extra_step_kwargs = {}
318
+ if accepts_eta:
319
+ extra_step_kwargs["eta"] = eta
320
+
321
+ # check if the scheduler accepts generator
322
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
323
+ if accepts_generator:
324
+ extra_step_kwargs["generator"] = generator
325
+ return extra_step_kwargs
326
+
327
+ def prepare_latents(self, batch_size, dtype, device, generator, latents=None):
328
+ shape = (batch_size, *self.vae.latent_shape)
329
+ if isinstance(generator, list) and len(generator) != batch_size:
330
+ raise ValueError(
331
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
332
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
333
+ )
334
+
335
+ if latents is None:
336
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
337
+ else:
338
+ latents = latents.to(device)
339
+
340
+ # scale the initial noise by the standard deviation required by the scheduler
341
+ latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
342
+ return latents
343
+
344
+ def prepare_image(self, image):
345
+ if isinstance(image, str) and not os.path.exists(image):
346
+ raise FileNotFoundError(f"Couldn't find image at path {image}")
347
+
348
+ if not isinstance(image, list):
349
+ image = [image]
350
+ image_pts = []
351
+ mask_pts = []
352
+ for img in image:
353
+ image_pt, mask_pt = self.image_processor(img, return_mask=True)
354
+ image_pts.append(image_pt)
355
+ mask_pts.append(mask_pt)
356
+
357
+ image_pts = torch.cat(image_pts, dim=0).to(self.device, dtype=self.dtype)
358
+ if mask_pts[0] is not None:
359
+ mask_pts = torch.cat(mask_pts, dim=0).to(self.device, dtype=self.dtype)
360
+ else:
361
+ mask_pts = None
362
+ return image_pts, mask_pts
363
+
364
+ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
365
+ """
366
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
367
+
368
+ Args:
369
+ w (`torch.Tensor`):
370
+ guidance scale values to embed, one per batch element
371
+ embedding_dim (`int`, *optional*, defaults to 512):
372
+ dimension of the embeddings to generate
373
+ dtype:
374
+ data type of the generated embeddings
375
+
376
+ Returns:
377
+ `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
378
+ """
379
+ assert len(w.shape) == 1
380
+ w = w * 1000.0
381
+
382
+ half_dim = embedding_dim // 2
383
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
384
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
385
+ emb = w.to(dtype)[:, None] * emb[None, :]
386
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
387
+ if embedding_dim % 2 == 1: # zero pad
388
+ emb = torch.nn.functional.pad(emb, (0, 1))
389
+ assert emb.shape == (w.shape[0], embedding_dim)
390
+ return emb
391
+
392
+ @torch.no_grad()
393
+ def __call__(
394
+ self,
395
+ image: Union[str, List[str], Image.Image] = None,
396
+ num_inference_steps: int = 50,
397
+ timesteps: List[int] = None,
398
+ sigmas: List[float] = None,
399
+ eta: float = 0.0,
400
+ guidance_scale: float = 7.5,
401
+ dual_guidance_scale: float = 10.5,
402
+ dual_guidance: bool = True,
403
+ generator=None,
404
+ box_v=1.01,
405
+ octree_resolution=384,
406
+ mc_level=-1 / 512,
407
+ num_chunks=8000,
408
+ mc_algo='mc',
409
+ output_type: Optional[str] = "trimesh",
410
+ enable_pbar=True,
411
+ **kwargs,
412
+ ) -> List[List[trimesh.Trimesh]]:
413
+ callback = kwargs.pop("callback", None)
414
+ callback_steps = kwargs.pop("callback_steps", None)
415
+
416
+ device = self.device
417
+ dtype = self.dtype
418
+ do_classifier_free_guidance = guidance_scale >= 0 and \
419
+ getattr(self.model, 'guidance_cond_proj_dim', None) is None
420
+ dual_guidance = dual_guidance_scale >= 0 and dual_guidance
421
+
422
+ image, mask = self.prepare_image(image)
423
+ cond = self.encode_cond(image=image,
424
+ mask=mask,
425
+ do_classifier_free_guidance=do_classifier_free_guidance,
426
+ dual_guidance=dual_guidance)
427
+ batch_size = image.shape[0]
428
+
429
+ t_dtype = torch.long
430
+ timesteps, num_inference_steps = retrieve_timesteps(
431
+ self.scheduler, num_inference_steps, device, timesteps, sigmas)
432
+
433
+ latents = self.prepare_latents(batch_size, dtype, device, generator)
434
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
435
+
436
+ guidance_cond = None
437
+ if getattr(self.model, 'guidance_cond_proj_dim', None) is not None:
438
+ print('Using lcm guidance scale')
439
+ guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size)
440
+ guidance_cond = self.get_guidance_scale_embedding(
441
+ guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim
442
+ ).to(device=device, dtype=latents.dtype)
443
+
444
+ for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)):
445
+ # expand the latents if we are doing classifier free guidance
446
+ if do_classifier_free_guidance:
447
+ latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2))
448
+ else:
449
+ latent_model_input = latents
450
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
451
+
452
+ # predict the noise residual
453
+ timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device)
454
+ timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
455
+ noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond)
456
+
457
+ # no drop, drop clip, all drop
458
+ if do_classifier_free_guidance:
459
+ if dual_guidance:
460
+ noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3)
461
+ noise_pred = (
462
+ noise_pred_uncond
463
+ + guidance_scale * (noise_pred_clip - noise_pred_dino)
464
+ + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond)
465
+ )
466
+ else:
467
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
468
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
469
+
470
+ # compute the previous noisy sample x_t -> x_t-1
471
+ outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)
472
+ latents = outputs.prev_sample
473
+
474
+ if callback is not None and i % callback_steps == 0:
475
+ step_idx = i // getattr(self.scheduler, "order", 1)
476
+ callback(step_idx, t, outputs)
477
+
478
+ return self._export(
479
+ latents,
480
+ output_type,
481
+ box_v, mc_level, num_chunks, octree_resolution, mc_algo,
482
+ )
483
+
484
+ def _export(self, latents, output_type, box_v, mc_level, num_chunks, octree_resolution, mc_algo):
485
+ if output_type != "latent":
486
+ latents = 1. / self.vae.scale_factor * latents
487
+ latents = self.vae(latents)
488
+ outputs = self.vae.latents2mesh(
489
+ latents,
490
+ bounds=box_v,
491
+ mc_level=mc_level,
492
+ num_chunks=num_chunks,
493
+ octree_resolution=octree_resolution,
494
+ mc_algo=mc_algo,
495
+ )
496
+ else:
497
+ outputs = latents
498
+
499
+ if output_type == 'trimesh':
500
+ outputs = export_to_trimesh(outputs)
501
+
502
+ return outputs
503
+
504
+
505
+ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
506
+
507
+ @torch.no_grad()
508
+ def __call__(
509
+ self,
510
+ image: Union[str, List[str], Image.Image] = None,
511
+ num_inference_steps: int = 50,
512
+ timesteps: List[int] = None,
513
+ sigmas: List[float] = None,
514
+ eta: float = 0.0,
515
+ guidance_scale: float = 7.5,
516
+ generator=None,
517
+ box_v=1.01,
518
+ octree_resolution=384,
519
+ mc_level=0.0,
520
+ mc_algo='mc',
521
+ num_chunks=8000,
522
+ output_type: Optional[str] = "trimesh",
523
+ enable_pbar=True,
524
+ **kwargs,
525
+ ) -> List[List[trimesh.Trimesh]]:
526
+ callback = kwargs.pop("callback", None)
527
+ callback_steps = kwargs.pop("callback_steps", None)
528
+
529
+ device = self.device
530
+ dtype = self.dtype
531
+ do_classifier_free_guidance = guidance_scale >= 0 and not (
532
+ hasattr(self.model, 'guidance_embed') and
533
+ self.model.guidance_embed is True
534
+ )
535
+
536
+ image, mask = self.prepare_image(image)
537
+ cond = self.encode_cond(
538
+ image=image,
539
+ mask=mask,
540
+ do_classifier_free_guidance=do_classifier_free_guidance,
541
+ dual_guidance=False,
542
+ )
543
+ batch_size = image.shape[0]
544
+
545
+ # 5. Prepare timesteps
546
+ # NOTE: this is slightly different from common usage, we start from 0.
547
+ sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas
548
+ timesteps, num_inference_steps = retrieve_timesteps(
549
+ self.scheduler,
550
+ num_inference_steps,
551
+ device,
552
+ sigmas=sigmas,
553
+ )
554
+ latents = self.prepare_latents(batch_size, dtype, device, generator)
555
+
556
+ guidance = None
557
+ if hasattr(self.model, 'guidance_embed') and \
558
+ self.model.guidance_embed is True:
559
+ guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
560
+
561
+ for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
562
+ # expand the latents if we are doing classifier free guidance
563
+ if do_classifier_free_guidance:
564
+ latent_model_input = torch.cat([latents] * 2)
565
+ else:
566
+ latent_model_input = latents
567
+
568
+ # NOTE: we assume model get timesteps ranged from 0 to 1
569
+ timestep = t.expand(latent_model_input.shape[0]).to(
570
+ latents.dtype) / self.scheduler.config.num_train_timesteps
571
+ noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance)
572
+
573
+ if do_classifier_free_guidance:
574
+ noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
575
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
576
+
577
+ # compute the previous noisy sample x_t -> x_t-1
578
+ outputs = self.scheduler.step(noise_pred, t, latents)
579
+ latents = outputs.prev_sample
580
+
581
+ if callback is not None and i % callback_steps == 0:
582
+ step_idx = i // getattr(self.scheduler, "order", 1)
583
+ callback(step_idx, t, outputs)
584
+
585
+ return self._export(
586
+ latents,
587
+ output_type,
588
+ box_v, mc_level, num_chunks, octree_resolution, mc_algo,
589
+ )
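End to end, the two pipelines above are intended to be driven roughly like this. This is a hedged sketch: the repository id and file paths are placeholders rather than values taken from this diff, and `from_pretrained` resolves a local path, `$HY3DGEN_MODELS`, or a Hugging Face repo, in that order.

```python
from hy3dgen.shapegen.pipelines import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
    'some-org/some-hunyuan3d-checkpoint',   # hypothetical repo id or local path
)
meshes = pipeline(
    image='demo.png',                        # hypothetical input image
    num_inference_steps=30,
    guidance_scale=7.5,
    octree_resolution=256,
)
if meshes[0] is not None:                    # surface extraction can fail
    meshes[0].export('demo.glb')             # output_type='trimesh' by default
```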
hy3dgen/shapegen/postprocessors.py ADDED
@@ -0,0 +1,175 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+
6
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
7
+ # The below software and/or models in this distribution may have been
8
+ # modified by THL A29 Limited ("Tencent Modifications").
9
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
10
+
11
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
12
+ # except for the third-party components listed below.
13
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
14
+ # in the respective licenses of these third-party components.
15
+ # Users must comply with all terms and conditions of original licenses of these third-party
16
+ # components and must ensure that the usage of the third party components adheres to
17
+ # all relevant laws and regulations.
18
+
19
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
20
+ # their software and algorithms, including trained model weights, parameters (including
21
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
22
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
23
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
24
+
25
+ import os
26
+ import tempfile
27
+ from typing import Union
28
+
29
+ import pymeshlab
30
+ import trimesh
31
+
32
+ from .models.vae import Latent2MeshOutput
33
+
34
+
35
+ def load_mesh(path):
36
+ if path.endswith(".glb"):
37
+ mesh = trimesh.load(path)
38
+ else:
39
+ mesh = pymeshlab.MeshSet()
40
+ mesh.load_new_mesh(path)
41
+ return mesh
42
+
43
+
44
+ def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000):
45
+ mesh.apply_filter(
46
+ "meshing_decimation_quadric_edge_collapse",
47
+ targetfacenum=max_facenum,
48
+ qualitythr=1.0,
49
+ preserveboundary=True,
50
+ boundaryweight=3,
51
+ preservenormal=True,
52
+ preservetopology=True,
53
+ autoclean=True
54
+ )
55
+ return mesh
56
+
57
+
58
+ def remove_floater(mesh: pymeshlab.MeshSet):
59
+ mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face",
60
+ nbfaceratio=0.005)
61
+ mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
62
+ mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
63
+ return mesh
64
+
65
+
66
+ def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
67
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
68
+ temp_file.close()
69
+ temp_file_name = temp_file.name
70
+
71
+ mesh.save_current_mesh(temp_file_name)
72
+ mesh = trimesh.load(temp_file_name)
73
+ if os.path.exists(temp_file_name):
74
+ os.remove(temp_file_name)
75
+
76
+ # 检查加载的对象类型
77
+ if isinstance(mesh, trimesh.Scene):
78
+ combined_mesh = trimesh.Trimesh()
79
+ # 如果是Scene,遍历所有的geometry并合并
80
+ for geom in mesh.geometry.values():
81
+ combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
82
+ mesh = combined_mesh
83
+ return mesh
84
+
85
+
86
+ def trimesh2pymeshlab(mesh: trimesh.Trimesh):
87
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
88
+ temp_file.close()
89
+ temp_file_name = temp_file.name
90
+
91
+ if isinstance(mesh, trimesh.scene.Scene):
92
+ for idx, obj in enumerate(mesh.geometry.values()):
93
+ if idx == 0:
94
+ temp_mesh = obj
95
+ else:
96
+ temp_mesh = temp_mesh + obj
97
+ mesh = temp_mesh
98
+ mesh.export(temp_file_name)
99
+ mesh = pymeshlab.MeshSet()
100
+ mesh.load_new_mesh(temp_file_name)
101
+ if os.path.exists(temp_file_name):
102
+ os.remove(temp_file_name)
103
+
104
+ return mesh
105
+
106
+
107
+ def export_mesh(input, output):
108
+ if isinstance(input, pymeshlab.MeshSet):
109
+ mesh = output
110
+ elif isinstance(input, Latent2MeshOutput):
111
+ # copy the processed geometry out of the pymeshlab result instead of overwriting it
+ out = Latent2MeshOutput()
112
+ out.mesh_v = output.current_mesh().vertex_matrix()
113
+ out.mesh_f = output.current_mesh().face_matrix()
114
+ mesh = out
115
+ else:
116
+ mesh = pymeshlab2trimesh(output)
117
+ return mesh
118
+
119
+
120
+ def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet:
121
+ if isinstance(mesh, str):
122
+ mesh = load_mesh(mesh)
123
+ elif isinstance(mesh, Latent2MeshOutput):
124
+ mesh = pymeshlab.MeshSet()
125
+ mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f)
126
+ mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
127
+
128
+ if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
129
+ mesh = trimesh2pymeshlab(mesh)
130
+
131
+ return mesh
132
+
133
+
134
+ class FaceReducer:
135
+ def __call__(
136
+ self,
137
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
138
+ max_facenum: int = 40000
139
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]:
140
+ ms = import_mesh(mesh)
141
+ ms = reduce_face(ms, max_facenum=max_facenum)
142
+ mesh = export_mesh(mesh, ms)
143
+ return mesh
144
+
145
+
146
+ class FloaterRemover:
147
+ def __call__(
148
+ self,
149
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
150
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
151
+ ms = import_mesh(mesh)
152
+ ms = remove_floater(ms)
153
+ mesh = export_mesh(mesh, ms)
154
+ return mesh
155
+
156
+
157
+ class DegenerateFaceRemover:
158
+ def __call__(
159
+ self,
160
+ mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str],
161
+ ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]:
162
+ ms = import_mesh(mesh)
163
+
164
+ temp_file = tempfile.NamedTemporaryFile(suffix='.ply', delete=True)
165
+ temp_file.close()
166
+ temp_file_name = temp_file.name
167
+
168
+ ms.save_current_mesh(temp_file_name)
169
+ ms = pymeshlab.MeshSet()
170
+ ms.load_new_mesh(temp_file_name)
171
+ if os.path.exists(temp_file_name):
172
+ os.remove(temp_file_name)
173
+
174
+ mesh = export_mesh(mesh, ms)
175
+ return mesh
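The three callables above are meant to be chained on whatever the shape pipeline returns; a short sketch (the file paths are placeholders):

```python
import trimesh
from hy3dgen.shapegen.postprocessors import FloaterRemover, DegenerateFaceRemover, FaceReducer

mesh = trimesh.load('raw_mesh.glb')             # Trimesh, MeshSet, Latent2MeshOutput or path all work
mesh = FloaterRemover()(mesh)                   # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)            # round-trip through PLY to clean degenerate faces
mesh = FaceReducer()(mesh, max_facenum=40000)   # quadric edge-collapse decimation
mesh.export('clean_mesh.glb')
```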
hy3dgen/shapegen/preprocessors.py ADDED
@@ -0,0 +1,127 @@
1
+ # Open Source Model Licensed under the Apache License Version 2.0
2
+ # and Other Licenses of the Third-Party Components therein:
3
+ # The below Model in this distribution may have been modified by THL A29 Limited
4
+ # ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
5
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
6
+ # The below software and/or models in this distribution may have been
7
+ # modified by THL A29 Limited ("Tencent Modifications").
8
+ # All Tencent Modifications are Copyright (C) THL A29 Limited.
9
+
10
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
11
+ # except for the third-party components listed below.
12
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
13
+ # in the respective licenses of these third-party components.
14
+ # Users must comply with all terms and conditions of original licenses of these third-party
15
+ # components and must ensure that the usage of the third party components adheres to
16
+ # all relevant laws and regulations.
17
+
18
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
19
+ # their software and algorithms, including trained model weights, parameters (including
20
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
21
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
22
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
23
+
24
+ import cv2
25
+ import numpy as np
26
+ import torch
27
+ from PIL import Image
28
+ from einops import repeat, rearrange
29
+
30
+
31
+ def array_to_tensor(np_array):
32
+ image_pt = torch.tensor(np_array).float()
33
+ image_pt = image_pt / 255 * 2 - 1
34
+ image_pt = rearrange(image_pt, "h w c -> c h w")
35
+ image_pts = repeat(image_pt, "c h w -> b c h w", b=1)
36
+ return image_pts
37
+
38
+
39
+ class ImageProcessorV2:
40
+ def __init__(self, size=512, border_ratio=None):
41
+ self.size = size
42
+ self.border_ratio = border_ratio
43
+
44
+ @staticmethod
45
+ def recenter(image, border_ratio: float = 0.2):
46
+ """ recenter an image to leave some empty space at the image border.
47
+
48
+ Args:
49
+ image (ndarray): input image, float/uint8 [H, W, 3/4]; a 4th channel, if present, is used as the alpha mask
50
+ border_ratio (float, optional): border ratio, the content is rescaled to fill (1 - border_ratio) of the output. Defaults to 0.2.
51
+
52
+ Returns:
53
+ tuple: (image, mask) -- recentered RGB image, uint8 [H, W, 3], and alpha mask, uint8 [H, W, 1]
55
+ """
56
+
57
+ if image.shape[-1] == 4:
58
+ mask = image[..., 3]
59
+ else:
60
+ mask = np.ones_like(image[..., 0:1]) * 255
61
+ image = np.concatenate([image, mask], axis=-1)
62
+ mask = mask[..., 0]
63
+
64
+ H, W, C = image.shape
65
+
66
+ size = max(H, W)
67
+ result = np.zeros((size, size, C), dtype=np.uint8)
68
+
69
+ coords = np.nonzero(mask)
70
+ x_min, x_max = coords[0].min(), coords[0].max()
71
+ y_min, y_max = coords[1].min(), coords[1].max()
72
+ h = x_max - x_min
73
+ w = y_max - y_min
74
+ if h == 0 or w == 0:
75
+ raise ValueError('input image is empty')
76
+ desired_size = int(size * (1 - border_ratio))
77
+ scale = desired_size / max(h, w)
78
+ h2 = int(h * scale)
79
+ w2 = int(w * scale)
80
+ x2_min = (size - h2) // 2
81
+ x2_max = x2_min + h2
82
+
83
+ y2_min = (size - w2) // 2
84
+ y2_max = y2_min + w2
85
+
86
+ result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2),
87
+ interpolation=cv2.INTER_AREA)
88
+
89
+ bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
90
+ # bg = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255
91
+ mask = result[..., 3:].astype(np.float32) / 255
92
+ result = result[..., :3] * mask + bg * (1 - mask)
93
+
94
+ mask = mask * 255
95
+ result = result.clip(0, 255).astype(np.uint8)
96
+ mask = mask.clip(0, 255).astype(np.uint8)
97
+ return result, mask
98
+
99
+ def __call__(self, image, border_ratio=0.15, to_tensor=True, return_mask=False, **kwargs):
100
+ if self.border_ratio is not None:
101
+ border_ratio = self.border_ratio
102
+ print(f"Using border_ratio from init: {border_ratio}")
103
+ if isinstance(image, str):
104
+ image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
105
+ image, mask = self.recenter(image, border_ratio=border_ratio)
106
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
107
+ elif isinstance(image, Image.Image):
108
+ image = np.asarray(image)
109
+ image, mask = self.recenter(image, border_ratio=border_ratio)
110
+
111
+ image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC)
112
+ mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST)
113
+ mask = mask[..., np.newaxis]
114
+
115
+ if to_tensor:
116
+ image = array_to_tensor(image)
117
+ mask = array_to_tensor(mask)
118
+ if return_mask:
119
+ return image, mask
120
+ return image
121
+
122
+
123
+ IMAGE_PROCESSORS = {
124
+ "v2": ImageProcessorV2,
125
+ }
126
+
127
+ DEFAULT_IMAGEPROCESSOR = 'v2'
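A short sketch of the preprocessor in isolation (the image path is a placeholder): it recenters the subject, composites it onto white, resizes to `size`, and optionally returns the alpha mask alongside the normalized tensor.

```python
from hy3dgen.shapegen.preprocessors import ImageProcessorV2

processor = ImageProcessorV2(size=512)
image, mask = processor('input.png', border_ratio=0.15, return_mask=True)  # hypothetical path
print(image.shape)  # torch.Size([1, 3, 512, 512]), values in [-1, 1]
print(mask.shape)   # torch.Size([1, 1, 512, 512])
```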