+##### [Weiyu Li*1,2](https://wyysf-98.github.io/), Jiarui Liu*1,2, Hongyu Yan*1,2, [Rui Chen1,2](https://aruichen.github.io/), [Yixun Liang2,3](https://yixunliang.github.io/), [Xuelin Chen4](https://xuelin-chen.github.io/), [Ping Tan1,2](https://ece.hkust.edu.hk/pingtan), [Xiaoxiao Long1,2](https://www.xxlong.site/)
+##### 1HKUST, 2LightIllusions, 3HKUST(GZ), 4Tencent AI Lab
+
+
+# Usage
+
+To use the model, please refer to the [official repository](https://github.com/wyysf-98/CraftsMan) for installation and usage instructions.
+
+```python
+from craftsman import CraftsManPipeline
+import torch
+
+# load the pipeline from a local checkpoint
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+# generate a mesh from a single image and export it
+mesh = pipeline("https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png").meshes[0]
+mesh.export("werewolf.obj")
+```
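+
+The pipeline call also accepts a few optional arguments, e.g. `num_inference_steps`, `mc_depth` (Marching Cubes resolution), and `only_max_component`. A minimal sketch; the values below are illustrative, not tuned defaults:
+
+```python
+from craftsman import CraftsManPipeline
+import torch
+
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+mesh = pipeline(
+    "https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png",
+    num_inference_steps=50,    # more denoising steps: slower, usually cleaner geometry
+    mc_depth=7,                # Marching Cubes resolution: 7 -> 128 cubes per axis, 8 -> 256
+    only_max_component=True,   # keep only the largest connected component of the mesh
+).meshes[0]
+mesh.export("werewolf.glb")
+```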
+
+## 🔥🔥🔥 News!!
+
+* Nov 16, 2024: 💬 We release CraftsMan-v1-5.
+
+
+## 📑 Open-source Plan
+
+- [x] Inference
+- [x] Checkpoints
+- [x] Training
+- [ ] ComfyUI
+
+## 🎉 **CraftsMan-v1-5 Architecture**
+
+
+
+
+
+
+## Get Started
+
+#### Begin by cloning the repository:
+
+```shell
+git clone https://github.com/wyysf-98/CraftsMan
+cd CraftsMan
+```
+
+#### Installation Guide for Linux
+
+We provide an `env_install.sh` script for setting up the environment.
+
+```shell
+# step 1. create conda env
+conda create -n CraftsMan python=3.10
+conda activate CraftsMan
+
+# step 2. install torch-related packages
+conda install pytorch==2.3.0 torchvision==0.18.0 pytorch-cuda=11.8 -c pytorch -c nvidia
+
+# step 3. install other packages
+pip install -r docker/requirements.txt
+```
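+
+Optionally, a quick sanity check that the environment was set up correctly (assumes the `CraftsMan` env is activated):
+
+```shell
+python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+```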
+
+
+
+#### Using Gradio
+
+We provide a Gradio demo for trying out the model. Run the following command to start it:
+
+```shell
+python3 gradio.py
+```
+
+The demo can then be accessed through the link printed in the console output.
+
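+#### Using the API Server
+
+The repository also ships a small FastAPI server (`server.py`) that wraps the pipeline. A minimal sketch of starting it and posting a request to the `/native3d_v1` endpoint; the port and the image/mesh paths below are placeholders:
+
+```shell
+python3 server.py --port 8000
+```
+
+```python
+import requests
+
+# /native3d_v1 loads the input image from disk and writes the generated mesh to mesh_path
+resp = requests.post(
+    "http://localhost:8000/native3d_v1",
+    json={"image_path": "assets/input.png", "mesh_path": "outputs/mesh.obj"},
+)
+print(resp.status_code)
+```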
+
+## Citation
+
+If you find this repository helpful, please cite our report:
+```bibtex
+@misc{li2024craftsman,
+    title = {CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner},
+    author = {Weiyu Li and Jiarui Liu and Rui Chen and Yixun Liang and Xuelin Chen and Ping Tan and Xiaoxiao Long},
+    year = {2024},
+    eprint = {2405.14979},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CG}
+}
+```
+
+
+# License
+
+[creativeml-openrail-m](https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE)
\ No newline at end of file
diff --git a/ckpts/craftsman-v1-5 b/ckpts/craftsman-v1-5
new file mode 160000
index 0000000000000000000000000000000000000000..9a5e9189c2dfab20cf838885dd6acaf99b41844e
--- /dev/null
+++ b/ckpts/craftsman-v1-5
@@ -0,0 +1 @@
+Subproject commit 9a5e9189c2dfab20cf838885dd6acaf99b41844e
diff --git a/craftsman/__pycache__/__init__.cpython-310.pyc b/craftsman/__pycache__/__init__.cpython-310.pyc
index 5a51ef6f8fe535a1397913052837454aaa7d53f7..a8785076ecfc8e97f756e760ae2bb5095879ffa7 100644
Binary files a/craftsman/__pycache__/__init__.cpython-310.pyc and b/craftsman/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/__pycache__/__init__.cpython-311.pyc b/craftsman/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc69ac74bbcfa36fde97eb2762a774a67d228bc5
Binary files /dev/null and b/craftsman/__pycache__/__init__.cpython-311.pyc differ
diff --git a/craftsman/__pycache__/pipeline.cpython-310.pyc b/craftsman/__pycache__/pipeline.cpython-310.pyc
index 11f8f99c630037594255abc91320ca76286dcedb..e4fdf0d89b5c80301e20cb0b7a894bf4331efa14 100644
Binary files a/craftsman/__pycache__/pipeline.cpython-310.pyc and b/craftsman/__pycache__/pipeline.cpython-310.pyc differ
diff --git a/craftsman/__pycache__/pipeline.cpython-311.pyc b/craftsman/__pycache__/pipeline.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80f55005a19035f7b9fa5f929d493aa8e071f7d6
Binary files /dev/null and b/craftsman/__pycache__/pipeline.cpython-311.pyc differ
diff --git a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc
index e386501d4e556a0b58284856146c0cc902fe116c..5a3e4cd7808df822b968a71f2d68a14dc29933d7 100644
Binary files a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc and b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc differ
diff --git a/craftsman/data/__pycache__/__init__.cpython-310.pyc b/craftsman/data/__pycache__/__init__.cpython-310.pyc
index a3984e5da4f87d7c9daa51190557b2d1b52905ae..0337eaaf4af9d87488de122e627d45f4ae88ecef 100644
Binary files a/craftsman/data/__pycache__/__init__.cpython-310.pyc and b/craftsman/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/data/__pycache__/base.cpython-310.pyc b/craftsman/data/__pycache__/base.cpython-310.pyc
index 598c77443204d9b4fedab25daee82170de9c59e0..e04ec395e61d3acc020c5ad560f1e7594055799c 100644
Binary files a/craftsman/data/__pycache__/base.cpython-310.pyc and b/craftsman/data/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/data/base.py b/craftsman/data/base.py
index a89fcbf466c1af978e498fb383eb52df10bc8d82..8b530cc41c75f9beab62c01760537ea3c8a60f9d 100755
--- a/craftsman/data/base.py
+++ b/craftsman/data/base.py
@@ -53,7 +53,7 @@ class BaseDataModuleConfig:
# for occupancy and sdf data
n_samples: int = 4096 # number of points in input point cloud
upsample_ratio: int = 1 # upsample ratio for input point cloud
- sampling_strategy: str = "random" # sampling strategy for input point cloud
+ sampling_strategy: Optional[str] = None # sampling strategy for input point cloud
scale: float = 1.0 # scale of the input point cloud and target supervision
load_supervision: bool = True # whether to load supervision
supervision_type: str = "occupancy" # occupancy, sdf, tsdf
@@ -70,6 +70,8 @@ class BaseDataModuleConfig:
idx: Optional[List[int]] = None # index of the image to load
n_views: int = 1 # number of views
marign_pix_dis: int = 30 # margin of the bounding box
+ batch_size: int = 32
+ num_workers: int = 8
class BaseDataset(Dataset):
@@ -78,7 +80,7 @@ class BaseDataset(Dataset):
self.cfg: BaseDataModuleConfig = cfg
self.split = split
- self.uids = json.load(open(f'{cfg.root_dir}/{split}.json'))
+ self.uids = json.load(open(f'{cfg.local_dir}/{split}.json'))
print(f"Loaded {len(self.uids)} {split} uids")
def __len__(self):
@@ -94,10 +96,7 @@ class BaseDataset(Dataset):
surface = np.concatenate([surface, normal], axis=1)
elif self.cfg.geo_data_type == "sdf":
# for sdf data with our own format
- if re.match(r"\.\.", self.uids[index]):
- data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
- else:
- data = np.load(f'{self.uids[index]}.npz')
+ data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
# for input point cloud
surface = data["surface"]
else:
@@ -112,6 +111,8 @@ class BaseDataset(Dataset):
import fpsample
kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5)
surface = surface[kdline_fps_samples_idx]
+ elif self.cfg.sampling_strategy is None:
+ pass
else:
raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented")
# rescale data
@@ -189,9 +190,9 @@ class BaseDataset(Dataset):
sel_idx = random.choice(self.cfg.idx)
ret["sel_image_idx"] = sel_idx
if self.cfg.image_type == "rgb":
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.png"
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.jpeg"
elif self.cfg.image_type == "normal":
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.png"
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.jpeg"
ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis)
else:
diff --git a/craftsman/models/__pycache__/__init__.cpython-310.pyc b/craftsman/models/__pycache__/__init__.cpython-310.pyc
index 6f2a4aa9900ea2774cdf420f186133bc7841c02a..11adffc179d52bd2723fda25a7c93449622bc2cb 100644
Binary files a/craftsman/models/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc
index 816e8317dfc4c21814e583abccec00f656238ef0..4dc405b92470faf12abfefebf27d6639c0c42d40 100644
Binary files a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc
index 6dc2375a7d7d199b2d62a491b5a98b85c9587634..842c373c10f5f0f6ec7f42e2fbde457bd549bda2 100644
Binary files a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc
index 5cdb9960006c8152e0087a74620d445905ca85c4..a95057bd6e6ca2a1ba21b989d773dd4832be6e79 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc
index 1bd6963909f1f914109d6babaea136f7b82e87f9..1ca2aefda5cd18199e0848691a3271dba7fac3eb 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc
index 1aa092ceb6e5e4d1b03c3ad1c906ba5a44f89763..45bca7ab683148d63f794b2f29a550371805ee1e 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc
index 9b2783cc2194ea5717c0037b3b33547d75372724..45e0f8eff2e57853db37a7bbacb4bfbc213d03a2 100644
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc
index 4f0517d090506cdf564f4e2c281cc1f4e7db2667..982cdf8a3dffaba1863b7eb59dcfe55923e82a06 100644
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/cond_encoder.py b/craftsman/models/conditional_encoders/cond_encoder.py
old mode 100644
new mode 100755
index 1faa5eb44fc421e84af7e8f5c54a4c4389396464..85c75e249bcda50d33c2dff9ea57c4f5b3ff3136
--- a/craftsman/models/conditional_encoders/cond_encoder.py
+++ b/craftsman/models/conditional_encoders/cond_encoder.py
@@ -46,7 +46,6 @@ class CondEmbedder(BaseEmbedder):
enable_gradient_checkpointing: bool = False
embeds_fusion_mode: int = 1 # 0: sum | 1: concat
linear_proj_init: str = "constant"
- text_model_type: str = "clip"
text_max_length: int = 77
image_size_clip: int = 224
image_size_dino: int = 224
@@ -277,29 +276,9 @@ class CondEmbedder(BaseEmbedder):
else:
return vision_outputs.last_hidden_state
- def post_process_embeds(self, text_embeds, visual_embeds):
- clip_embeds, dino_embeds = visual_embeds.chunk(2, dim=2)
- if self.cfg.normalize_embeds:
- # post-process the text/visual embeds
- if text_embeds is not None:
- text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
- if clip_embeds is not None:
- clip_embeds = clip_embeds / clip_embeds.norm(dim=-1, keepdim=True)
- if dino_embeds is not None:
- dino_embeds = dino_embeds / dino_embeds.norm(dim=-1, keepdim=True)
-
- assert text_embeds is not None or dino_embeds is not None or clip_embeds is not None
-
- if text_embeds is not None and visual_embeds is not None:
- return torch.cat([text_embeds, visual_embeds], dim=1)
- elif text_embeds is not None:
- return text_embeds
- else:
- return visual_embeds
-
def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor:
clip_embeds = self.encode_image_clip(images, cameras)
dino_embeds = self.encode_image_dino(images, cameras)
dino_embeds = self.linear_proj(dino_embeds)
visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1)
- return visual_embeds
\ No newline at end of file
+ return visual_embeds
diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc
index a8cc31eaf7786f8da942a578c437e98187aead95..e5febc512824e0aae8e91be9763a93fc54b69ddd 100644
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc
index ddafb045cd5591010924c1b676245338418afee4..2e015e8893c63efecead9dd44e7710d08de78ef5 100644
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc
index 7403ff05f007ce3dcd68a6b68853f6d9e2b9c765..821a33cc4bcdcbcd3b0bb42e5108209fdd5f10e5 100644
Binary files a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc
index 64aab3f6a63158b80efed5c4714f60ef5cbd37c6..19b1973d0fdb7f557a328cea64758115cbee37a1 100644
Binary files a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc
index 8a5b02a2bf10278ba7fc10ac184473d9712092ae..59f9f8adaf4e3ea92c8ef9a2bb65a733b26e9e33 100644
Binary files a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/pixart_denoiser.py b/craftsman/models/denoisers/pixart_denoiser.py
index b34f1caa2d4bf6d8385106f619dcad6314cb7281..4c64be12d1e680a08f1917d3a8d1f6c2c93dc1f4 100755
--- a/craftsman/models/denoisers/pixart_denoiser.py
+++ b/craftsman/models/denoisers/pixart_denoiser.py
@@ -25,15 +25,11 @@ class PixArtDinoDenoiser(BaseModule):
context_dim: int = 1024
n_views: int = 1
context_ln: bool = True
- skip_ln: bool = False
init_scale: float = 0.25
use_checkpoint: bool = False
drop_path: float = 0.
- variance_type: str = ""
- img_pos_embed: bool = False
clip_weight: float = 1.0
dino_weight: float = 1.0
- dit_block: str = ""
cfg: Config
@@ -63,9 +59,8 @@ class PixArtDinoDenoiser(BaseModule):
init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
- ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block)
self.blocks = nn.ModuleList([
- ditblock(
+ DiTBlock(
width=self.cfg.width,
heads=self.cfg.heads,
init_scale=init_scale,
@@ -82,11 +77,7 @@ class PixArtDinoDenoiser(BaseModule):
)
# final layer
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
- self.output_channels = self.cfg.output_channels * 2
- else:
- self.output_channels = self.cfg.output_channels
- self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels)
+ self.final_layer = T2IFinalLayer(self.cfg.width, self.cfg.output_channels)
self.identity_initialize()
@@ -99,17 +90,6 @@ class PixArtDinoDenoiser(BaseModule):
self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v
self.load_state_dict(self.denoiser_ckpt, strict=False)
- def forward_with_dpmsolver(self, model_input, timestep, context):
- """
- dpm solver donnot need variance prediction
- """
- # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
- model_out = self.forward(model_input, timestep, context)
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
- return model_out.chunk(2, dim=-1)[0]
- else:
- return model_out
-
def identity_initialize(self):
for block in self.blocks:
nn.init.constant_(block.attn.c_proj.weight, 0)
diff --git a/craftsman/models/denoisers/utils.py b/craftsman/models/denoisers/utils.py
old mode 100644
new mode 100755
index 9e29cbbf784d8cfa0683eda7b7e72b0cf68356ce..ab03bbe930cff27fab9c45ffbf25220551e024cc
--- a/craftsman/models/denoisers/utils.py
+++ b/craftsman/models/denoisers/utils.py
@@ -10,126 +10,6 @@ from timm.models.layers import DropPath
from craftsman.models.transformers.utils import MLP
from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention
-class PatchEmbed(nn.Module):
- """ 2D Image to Patch Embedding
- """
- def __init__(
- self,
- patch_size=16,
- in_chans=3,
- embed_dim=768,
- norm_layer=None,
- flatten=True,
- bias=True,
- ):
- super().__init__()
- patch_size = to_2tuple(patch_size)
- self.patch_size = patch_size
- self.flatten = flatten
- self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
- self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
-
- def forward(self, x):
- x = self.proj(x)
- if self.flatten:
- x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
- x = self.norm(x)
- return x
-
-class DiTBlock(nn.Module):
- """
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
- """
-
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
- super().__init__()
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
- self.attn = MultiheadAttention(
- n_ctx=None,
- width=width,
- heads=heads,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash
- )
- self.cross_attn = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
-
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
- self.mlp = MLP(width=width, init_scale=init_scale)
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
- def forward(self, x, visual_cond, t, **kwargs):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
- x = x + self.cross_attn(x, visual_cond)
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
- return x
-
-class DiTBlock_text(nn.Module):
- """
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
- """
-
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
- super().__init__()
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
- self.attn = MultiheadAttention(
- n_ctx=None,
- width=width,
- heads=heads,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash
- )
- self.cross_attn = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
-
- self.cross_attn_extra = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
- self.mlp = MLP(width=width, init_scale=init_scale)
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
- def forward(self, x, visual_cond, text_cond, t, **kwargs):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
- x = x + self.cross_attn(x, visual_cond)
- x = x + self.cross_attn_extra(x, text_cond)
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
- return x
-
class DiTBlock(nn.Module):
"""
A DiT block with adaptive layer norm (adaLN-single) conditioning.
@@ -174,11 +54,6 @@ class DiTBlock(nn.Module):
def t2i_modulate(x, shift, scale):
return x * (1 + scale) + shift
-# def t2i_modulate(x, shift, scale):
-# a = torch.ones_like(scale)
-# a[..., 768:] = 0
-# return x * (a + scale) + shift
-
def auto_grad_checkpoint(module, *args, **kwargs):
if getattr(module, 'grad_checkpointing', False):
if not isinstance(module, Iterable):
@@ -268,63 +143,4 @@ class T2IFinalLayer(nn.Module):
shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
x = t2i_modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
- return x
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
- """
- embed_dim: output dimension for each position
- pos: a list of positions to be encoded: size (M,)
- out: (M, D)
- """
- assert embed_dim % 2 == 0
- omega = np.arange(embed_dim // 2, dtype=np.float64)
- omega /= embed_dim / 2.
- omega = 1. / 10000 ** omega # (D/2,)
-
- pos = pos.reshape(-1) # (M,)
- out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
-
- emb_sin = np.sin(out) # (M, D/2)
- emb_cos = np.cos(out) # (M, D/2)
-
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
- return emb
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
- assert embed_dim % 2 == 0
-
- # use half of dimensions to encode grid_h
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
-
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
- return emb
-
-def _ntuple(n):
- def parse(x):
- if isinstance(x, Iterable) and not isinstance(x, str):
- return x
- return tuple(repeat(x, n))
- return parse
-
-to_1tuple = _ntuple(1)
-to_2tuple = _ntuple(2)
-
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0, base_size=16):
- """
- grid_size: int of the grid height and width
- return:
- pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
- """
- if isinstance(grid_size, int):
- grid_size = to_2tuple(grid_size)
- grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / pe_interpolation
- grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / pe_interpolation
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
- grid = np.stack(grid, axis=0)
- grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
-
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
- if cls_token and extra_tokens > 0:
- pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
- return pos_embed
\ No newline at end of file
+ return x
\ No newline at end of file
diff --git a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc
index 5e8a39cd825569e0bbe4ba22f7241f6cbedc7069..97ec08e72b9bc4b93582d20c4790036fd0dd42ce 100644
Binary files a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc
index 54b91b362964b3f4242d48775f76166c08e6f05f..001496e810e820ce5e93dcbc470a3e43cacda8f6 100644
Binary files a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc
index 759c2dac939bd6029335df19094e417576e369e9..74b1e26189f3c644f9a597d481406a61c116895e 100644
Binary files a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc
index 86aaa2ab10591ad95b7931524ed4ced417f7c8ac..a895397929ea800378c5387035a121236bf9567a 100644
Binary files a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc
index 7b0fc923ea0ee99e65f6e38b6a3e98df4ba5dd08..06bc047d40e34f7868fbec57464ff80b5030db62 100644
Binary files a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc
index ba2de6a0419f045de38bb33d2f42ef89a29a6fa1..f21f6acb7b0ac748ee2914a5cce09bef3228072a 100644
Binary files a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/attention.py b/craftsman/models/transformers/attention.py
old mode 100644
new mode 100755
index 81d179cf4e070c7a0520567b5e64dd3d7ed03792..e4623355b6d1d4d3d2c5c8827d23f8ecf4f0b0c6
--- a/craftsman/models/transformers/attention.py
+++ b/craftsman/models/transformers/attention.py
@@ -9,126 +9,6 @@ from craftsman.utils.checkpoint import checkpoint
from .utils import init_linear, MLP
from timm.models.vision_transformer import Attention
-def scaled_dot_product_gqa(
- query: Tensor,
- key: Tensor,
- value: Tensor,
- dropout: float = 0.0,
- scale: Optional[float] = None,
- mask: Optional[Tensor] = None,
- is_causal: Optional[bool] = None,
- need_weights: bool = False,
- average_attn_weights: bool = False,
- force_grouped: bool = False,
-):
- """Scaled dot product attention with support for grouped queries.
-
- Einstein notation:
- - b: batch size
- - n / s: sequence length
- - h: number of heads
- - g: number of groups
- - d: dimension of query/key/value
-
- Args:
- query: Query tensor of shape (b, n, h, d)
- key: Key tensor of shape (b, s, h, d)
- value: Value tensor of shape (b, s, h, d)
- dropout: Dropout probability (default: 0.0)
- scale: Scale factor for query (default: d_query ** 0.5)
- mask: Mask tensor of shape (b, n, s) or (b, s). If 'ndim == 2', the mask is
- applied to all 'n' rows of the attention matrix. (default: None)
- force_grouped: If True, apply grouped-query attention even if the number of
- heads is equal for query, key, and value. (default: False)
-
- Returns:
- 2-tuple of:
- - Attention output with shape (b, n, h, d)
- - (Optional) Attention weights with shape (b, h, n, s). Only returned if
- 'need_weights' is True.
- """
- if (mask is not None) and (is_causal is not None):
- raise ValueError(
- "Only one of 'mask' and 'is_causal' should be provided, but got both."
- )
- elif not query.ndim == key.ndim == value.ndim == 4:
- raise ValueError(
- f"Expected query, key, and value to be 4-dimensional, but got shapes "
- f"{query.shape}, {key.shape}, and {value.shape}."
- )
-
- # Move sequence length dimension to axis 2.
- # This makes the attention operations below *much* faster.
- query = rearrange(query, "b n h d -> b h n d")
- key = rearrange(key, "b s h d -> b h s d")
- value = rearrange(value, "b s h d -> b h s d")
-
- bq, hq, nq, dq = query.shape
- bk, hk, nk, dk = key.shape
- bv, hv, nv, dv = value.shape
- if not (bq == bk == bv and dq == dk == dv):
- raise ValueError(
- "Expected query, key, and value to have the same batch size (dim=0) and "
- f"embedding dimension (dim=3), but got query: {query.shape}, "
- f"key: {key.shape}, and value: {value.shape}."
- )
- elif (hk != hv) or (nk != nv):
- raise ValueError(
- "Expected key and value to have the same size in dimensions 1 and 2, but "
- f"got key: {key.shape} and value: {value.shape}."
- )
- elif hq % hk != 0:
- raise ValueError(
- "Expected query heads to be a multiple of key/value heads, but got "
- f"query: {query.shape} and key/value: {key.shape}."
- )
-
- if scale is None:
- scale = query.size(-1) ** 0.5
- query = query / scale
-
- num_head_groups = hq // hk
- query = rearrange(query, "b (h g) n d -> b g h n d", g=num_head_groups)
- similarity = einsum(query, key, "b g h n d, b h s d -> b g h n s")
-
- if is_causal:
- # Mask out the upper triangular portion of the attention matrix. This prevents
- # the model from attending to tokens in the future.
- mask = torch.ones((bq, nq, nk), device=query.device, dtype=torch.bool).tril_()
-
- if mask is not None:
- # Expand mask to match the shape of the attention matrix.
- # If mask is 2D, assume that it is applied to the key/value sequence dimension.
- # Else if mask is 3D, assume that it is applied to the query/key/value sequence
- # dimension for all attention heads.
- #
- if mask.ndim == 2:
- mask = rearrange(mask, "b s -> b () () () s")
- elif mask.ndim == 3:
- mask = rearrange(mask, "b n s -> b () () n s")
- # Mask similarity values by setting them to negative infinity. This guarantees
- # that they will not contribute to the softmax computation below.
- similarity.masked_fill_(~mask, torch.finfo(similarity.dtype).min)
-
- attention = F.softmax(similarity, dim=-1)
- if dropout > 0.0:
- attention = F.dropout(attention, p=dropout)
-
- # Apply attention matrix to the value Tensor.
- out = einsum(attention, value, "b g h n s, b h s d -> b g h n d")
- # Move head dimension back to axis 2
- out = rearrange(out, "b g h n d -> b n (h g) d")
-
- attn_weights: Optional[Tensor] = None
- if need_weights:
- # Move the sequence dimensions back to positions 1, 2. Move the head dimension
- # to position 3. This more closely matches the return shape of the attention
- # output: (b, n, h, d).
- attn_weights = rearrange(attention, "b g h n s -> b n s (h g)")
- if average_attn_weights:
- attn_weights = attn_weights.mean(dim=1)
-
- return out, attn_weights
class MultiheadAttention(nn.Module):
def __init__(
@@ -327,4 +207,4 @@ class ResidualCrossAttentionBlock(nn.Module):
def forward(self, x: torch.Tensor, data: torch.Tensor):
x = x + self.attn(self.ln_1(x), self.ln_2(data))
x = x + self.mlp(self.ln_3(x))
- return x
\ No newline at end of file
+ return x
diff --git a/craftsman/models/transformers/perceiver_1d.py b/craftsman/models/transformers/perceiver_1d.py
old mode 100644
new mode 100755
diff --git a/craftsman/models/transformers/utils.py b/craftsman/models/transformers/utils.py
old mode 100644
new mode 100755
diff --git a/craftsman/pipeline.py b/craftsman/pipeline.py
old mode 100644
new mode 100755
index 15edafd62330df4c7d3d282ab09bcd228a6353b4..a0d62fe0990761f59dc4e7f1fe79ddea4bf7c04d
--- a/craftsman/pipeline.py
+++ b/craftsman/pipeline.py
@@ -158,6 +158,7 @@ class CraftsManPipeline():
background_color: List[int] = [255, 255, 255],
foreground_ratio: float = 0.95,
mc_depth: int = 8,
+ only_max_component: bool = False,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -198,6 +199,9 @@ class CraftsManPipeline():
mc_depth (`int`, *optional*, defaults to 8):
The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z.
8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be.
+ only_max_component (`bool`, *optional*, defaults to `False`):
+ Whether to only keep the largest connected component of the mesh. This is useful when the mesh has
+ multiple components and only the largest one is needed.
Examples:
Returns:
@@ -258,6 +262,15 @@ class CraftsManPipeline():
if output_type == "trimesh":
import trimesh
cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1])
+ if only_max_component:
+ components = cur_mesh.split(only_watertight=False)
+ bbox = []
+ for c in components:
+ bbmin = c.vertices.min(0)
+ bbmax = c.vertices.max(0)
+ bbox.append((bbmax - bbmin).max())
+ max_component = np.argmax(bbox)
+ cur_mesh = components[max_component]
mesh.append(cur_mesh)
elif output_type == "np":
mesh.append(mesh_v_f[0])
diff --git a/craftsman/systems/__pycache__/__init__.cpython-310.pyc b/craftsman/systems/__pycache__/__init__.cpython-310.pyc
index a33a8274329244348289bbfd7de12c4a6533a0d1..275a97fa604c9525eb8a9aeb5cc7f70394a6f088 100644
Binary files a/craftsman/systems/__pycache__/__init__.cpython-310.pyc and b/craftsman/systems/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/base.cpython-310.pyc b/craftsman/systems/__pycache__/base.cpython-310.pyc
index 368251a7d4d3b344031271b791f22c803dbea049..22227831b79934f817e5104d93855117bd837fe5 100644
Binary files a/craftsman/systems/__pycache__/base.cpython-310.pyc and b/craftsman/systems/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc
index 3dc2e3115a0738aa6d00e90c50b83cae66274e60..355b9b8ad90cef4e78e8a19d0489ed73aced96e0 100644
Binary files a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc and b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc
index 12bf192a5b35186b6aa033695f0ea61c3272a554..71d14323b7666b642eb2b2a867484678f42eb5d4 100644
Binary files a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc and b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/utils.cpython-310.pyc b/craftsman/systems/__pycache__/utils.cpython-310.pyc
old mode 100644
new mode 100755
diff --git a/craftsman/systems/pixart_diffusion.py b/craftsman/systems/pixart_diffusion.py
old mode 100644
new mode 100755
index 529167af2542f7d857defa39074e8047817b0b8c..93466315855a0b1b93d44e8977eeeda103f95fe5
--- a/craftsman/systems/pixart_diffusion.py
+++ b/craftsman/systems/pixart_diffusion.py
@@ -251,9 +251,9 @@ class PixArtDiffusionSystem(BaseSystem):
return {
"loss_diffusion": loss,
"latents": latents,
- "x_t": x_t,
+ "x_t": noisy_z,
"noise": noise,
- "noise_pred": pred_noise,
+ "noise_pred": noise_pred,
"timesteps": timesteps,
}
@@ -373,4 +373,4 @@ class PixArtDiffusionSystem(BaseSystem):
return outputs
def on_validation_epoch_end(self):
- pass
\ No newline at end of file
+ pass
diff --git a/craftsman/utils/__pycache__/__init__.cpython-310.pyc b/craftsman/utils/__pycache__/__init__.cpython-310.pyc
index b9c8c4a21e1ddb79aab68a042d8c41e41bae9ad4..dc8fc56aa251450661270183a09d2d2ef7ce4e3c 100644
Binary files a/craftsman/utils/__pycache__/__init__.cpython-310.pyc and b/craftsman/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/__init__.cpython-311.pyc b/craftsman/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d176e286f072ec0c415f0db0f8c81b0967893b89
Binary files /dev/null and b/craftsman/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/base.cpython-310.pyc b/craftsman/utils/__pycache__/base.cpython-310.pyc
index 83a68795573ff8940846efa2c7d038d175d73b83..853925fb34f1b0e3cf043bcaaae367591f543ccd 100644
Binary files a/craftsman/utils/__pycache__/base.cpython-310.pyc and b/craftsman/utils/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/base.cpython-311.pyc b/craftsman/utils/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83f500bde0064775718572f0abc99760debdb29f
Binary files /dev/null and b/craftsman/utils/__pycache__/base.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc
index bd5b481c93c34571f33c0fc48ef145ed23b1fa58..fed9e95b7e88f3a21a4f3816a7f2ae36b5a1b295 100644
Binary files a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc and b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/config.cpython-310.pyc b/craftsman/utils/__pycache__/config.cpython-310.pyc
index aac3aef5468c180268a0c8c52a1707e60d870822..45270df949585d3d479f430499777ffa86b42f68 100644
Binary files a/craftsman/utils/__pycache__/config.cpython-310.pyc and b/craftsman/utils/__pycache__/config.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/config.cpython-311.pyc b/craftsman/utils/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f70d9dc896ed0327846e4140fc2261f359ad954c
Binary files /dev/null and b/craftsman/utils/__pycache__/config.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/misc.cpython-310.pyc b/craftsman/utils/__pycache__/misc.cpython-310.pyc
index 3adf88e303ffbe6e877b42cbadc7f326fcbad751..b6c11bf8560b424fcd6f45d5da9202ddcde69eae 100644
Binary files a/craftsman/utils/__pycache__/misc.cpython-310.pyc and b/craftsman/utils/__pycache__/misc.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/ops.cpython-310.pyc b/craftsman/utils/__pycache__/ops.cpython-310.pyc
index debfa41eae014018d9d0ba540248b2f77de3c23b..e8bdd120d0dc5c632d918740c6eeab3e0fb99f68 100644
Binary files a/craftsman/utils/__pycache__/ops.cpython-310.pyc and b/craftsman/utils/__pycache__/ops.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/saving.cpython-310.pyc b/craftsman/utils/__pycache__/saving.cpython-310.pyc
index 4b38d835c68dc5a67dad09beea97d2bb2f47fcf4..a8fa8a52d8589ce573de44a75b1ebe40fdf99c9f 100644
Binary files a/craftsman/utils/__pycache__/saving.cpython-310.pyc and b/craftsman/utils/__pycache__/saving.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc
index d7f9b4184fd39cf6b2df5076a97681f89ea577fe..7f65835c4ecd6f9233a169a24e5e14184a122ca4 100644
Binary files a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc and b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/typing.cpython-310.pyc b/craftsman/utils/__pycache__/typing.cpython-310.pyc
index 24970b9142feaba6029154274a042c4c34fc6e92..79b8dae01c4afaf63968f2bb64582ba8204f29a1 100644
Binary files a/craftsman/utils/__pycache__/typing.cpython-310.pyc and b/craftsman/utils/__pycache__/typing.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/typing.cpython-311.pyc b/craftsman/utils/__pycache__/typing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..448acbdfe51e93efe4f6efbc5fa514db87a3902c
Binary files /dev/null and b/craftsman/utils/__pycache__/typing.cpython-311.pyc differ
diff --git a/docker/requirements.txt b/docker/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..297f4a1b96ee50046dba8c079bdf3165c1f66728
--- /dev/null
+++ b/docker/requirements.txt
@@ -0,0 +1,34 @@
+datasets==2.19.0
+diffusers==0.31.0
+einops==0.8.0
+huggingface-hub==0.26.2
+imageio==2.34.1
+jaxtyping==0.2.28
+joblib==1.4.0
+lightning-utilities==0.11.2
+matplotlib==3.8.4
+numpy==1.26.4
+omegaconf==2.3.0
+opencv-python==4.9.0.80
+pandas==2.2.2
+pillow==10.3.0
+plyfile==1.0.3
+PyMCubes==0.1.4
+pyparsing==3.1.2
+pytorch-lightning==2.2.4
+PyYAML==6.0.1
+safetensors==0.4.3
+scikit-image==0.23.2
+scipy==1.13.0
+tensorboard==2.16.2
+tensorboardX==2.6.2.2
+timm==0.9.16
+tokenizers==0.19.1
+tqdm==4.66.2
+transformers==4.40.1
+trimesh==4.3.2
+spaces==0.28.3
+accelerate==0.29.1
+rembg==2.0.59
+gradio==5.5.0
+wandb==0.18.6
\ No newline at end of file
diff --git a/server.py b/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..614a21e19baac9991c6a16871cb0cb7f340797eb
--- /dev/null
+++ b/server.py
@@ -0,0 +1,98 @@
+import argparse
+import base64
+import os
+from datetime import datetime
+import traceback
+import torch
+import trimesh
+from craftsman import CraftsManPipeline
+
+CURRENT_DIR = f'/tmp/native3d_server/{os.getpid()}'
+os.makedirs(CURRENT_DIR, exist_ok=True)
+
+def parse_parameters():
+ parser = argparse.ArgumentParser("native3d")
+ parser.add_argument('--host', default="0.0.0.0", type=str)
+ parser.add_argument('--port', default=80, type=int)
+ return parser.parse_args()
+
+# -------------------- fastapi --------------------
+from typing import Optional
+from pydantic import BaseModel, Field
+
+class Native3DRequestV1(BaseModel):
+ image_path: str # input image path
+ mesh_path: str # output mesh path, support glb or obj in clean dir
+
+class Native3DResponseV1(BaseModel):
+ pass
+
+class Native3DRequestV2(BaseModel):
+ image_bytes: str # input image bytes(base64)
+ mesh_type: str # output mesh type, support glb or obj
+
+class Native3DResponseV2(BaseModel):
+ mesh_bytes: str # output mesh bytes(base64)
+
+if __name__=="__main__":
+ parse_args = parse_parameters()
+
+ # prepare models
+ pipeline = CraftsManPipeline.from_pretrained("ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+ # -------------------- fastapi --------------------
+ from fastapi import FastAPI, Request
+ import requests
+ app = FastAPI()
+
+ @app.post("/native3d_v1", response_model=Native3DResponseV1)
+ async def native3d(request: Request, image_to_mesh_request: Native3DRequestV1):
+ try:
+ print(f"image_to_mesh_request = {image_to_mesh_request}")
+ mesh = pipeline(image_to_mesh_request.image_path).meshes[0]
+ os.makedirs(os.path.dirname(os.path.abspath(image_to_mesh_request.mesh_path)), exist_ok=True)
+ mesh.export(image_to_mesh_request.mesh_path)
+ except Exception as e:
+ traceback.print_exc()
+ print(f"generate_model error: {e}")
+ return Native3DResponseV1()
+
+ @app.post("/native3d_v2", response_model=Native3DResponseV2)
+ async def native3d(request: Request, image_to_mesh_request: Native3DRequestV2):
+ try:
+ # print(f"image_to_mesh_request = {image_to_mesh_request}")
+ mesh_type = image_to_mesh_request.mesh_type
+ assert mesh_type in ['obj', 'glb']
+ task_id = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') + '-' + 'native3d'
+ current_dir = os.path.join(CURRENT_DIR, task_id)
+ os.makedirs(current_dir, exist_ok=True)
+ image_path = os.path.join(current_dir, 'input_image.png')
+ with open(image_path, 'wb') as f:
+ f.write(base64.b64decode(image_to_mesh_request.image_bytes))
+ mesh_path = os.path.join(current_dir, f'output_mesh.{mesh_type}')
+ import time
+ start = time.time()
+ # mesh = pipeline(image_path, mc_depth=8, num_inference_steps=25).meshes[0]
+ # mesh = pipeline(image_path, mc_depth=7, num_inference_steps=25).meshes[0]
+ mesh = pipeline(image_path, mc_depth=7, num_inference_steps=50).meshes[0]
+ print(f"Time: {time.time() - start}s")
+ os.makedirs(os.path.dirname(os.path.abspath(mesh_path)), exist_ok=True)
+ mesh.visual = trimesh.visual.TextureVisuals(
+ material=trimesh.visual.material.PBRMaterial(
+ baseColorFactor=(255, 255, 255), main_color=(255, 255, 255), metallicFactor=0.05, roughnessFactor=1.0
+ )
+ )
+ mesh.export(mesh_path)
+ with open(mesh_path, 'rb') as f:
+ mesh_bytes = f.read()
+ except Exception as e:
+ traceback.print_exc()
+ print(f"generate_model error: {e}")
+ return Native3DResponseV2(mesh_bytes=base64.b64encode(mesh_bytes).decode('utf-8'))
+
+ @app.get("/health")
+ async def health():
+ return {"status": "OK"}
+
+ import uvicorn
+ uvicorn.run(app, host=parse_args.host, port=parse_args.port)