diff --git a/ckpts/.gitattributes b/ckpts/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/ckpts/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/ckpts/README.md b/ckpts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8cf968a395bf8fa4fb7f36da5c429a0560165bfe --- /dev/null +++ b/ckpts/README.md @@ -0,0 +1,117 @@ +--- +library_name: craftsman-v1-5 +license: creativeml-openrail-m +license_name: creativeml-openrail-m +license_link: https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE +pipeline_tag: image-to-3d +language: +- en +- zh +--- + +## **CraftsMan-v1-5** + +

+### CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner
+
+##### [Weiyu Li*1,2](https://wyysf-98.github.io/), Jiarui Liu*1,2, Hongyu Yan*1,2, [Rui Chen1,2](https://aruichen.github.io/), [Yixun Liang2,3](https://yixunliang.github.io/), [Xuelin Chen4](https://xuelin-chen.github.io/), [Ping Tan1,2](https://ece.hkust.edu.hk/pingtan), [Xiaoxiao Long1,2](https://www.xxlong.site/)
+
+##### 1HKUST, 2LightIllusions, 3HKUST(GZ), 4Tencent AI Lab
+
+
+# Usage
+
+To use the model, please refer to the [official repository](https://github.com/wyysf-98/CraftsMan) for installation and usage instructions.
+
+```python
+from craftsman import CraftsManPipeline
+import torch
+
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32) # load from local ckpt
+mesh = pipeline("https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png").meshes[0]
+mesh.export("werewolf.obj")
+```
+
+## 🔥🔥🔥 News!!
+
+* Nov 16, 2024: 💬 We released CraftsMan-v1-5.
+
+## 📑 Open-source Plan
+
+- [x] Inference
+- [x] Checkpoints
+- [x] Training
+- [ ] ComfyUI
+
+## 🎉 **CraftsMan-v1-5 Architecture**
+


+
+## Get Started
+
+#### Begin by cloning the repository:
+
+```shell
+git clone https://github.com/wyysf-98/CraftsMan
+cd CraftsMan
+```
+
+#### Installation Guide for Linux
+
+We provide an `env_install.sh` script for setting up the environment.
+
+```shell
+# step 1. create the conda env
+conda create -n CraftsMan python=3.10
+conda activate CraftsMan
+
+# step 2. install torch-related packages
+conda install -c pytorch pytorch=2.3.0 torchvision=0.18.0 cudatoolkit=11.8
+
+# step 3. install other packages
+pip install -r docker/requirements.txt
+```
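A quick sanity check (a minimal sketch, assuming the `CraftsMan` conda environment created above) to confirm that PyTorch and the GPU are visible before running the pipeline or the demo:

```python
# minimal environment check; assumes the CraftsMan conda env from the steps above
import torch

print(torch.__version__)           # expected: 2.3.0, per the conda install step
print(torch.cuda.is_available())   # should print True on a working GPU setup
```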
+ + +#### Using Gradio + +We have prepared a gradio demo for you to try out the model. You can run the following command to start the demo. + +```shell +# std +python3 gradio.py +``` + +Then the demo can be accessed through the output link. + + +## Citation + +If you found this repository helpful, please cite our report: +```bibtex +@misc{li2024craftsman, +title = {CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner}, +author = {Weiyu Li and Jiarui Liu and Rui Chen and Yixun Liang and Xuelin Chen and Ping Tan and Xiaoxiao Long}, +year = {2024}, +archivePrefix = {arXiv preprint arXiv:2405.14979}, +primaryClass = {cs.CG} +} +``` + + +# License + +[creativeml-openrail-m](https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE) \ No newline at end of file diff --git a/ckpts/craftsman-v1-5 b/ckpts/craftsman-v1-5 new file mode 160000 index 0000000000000000000000000000000000000000..9a5e9189c2dfab20cf838885dd6acaf99b41844e --- /dev/null +++ b/ckpts/craftsman-v1-5 @@ -0,0 +1 @@ +Subproject commit 9a5e9189c2dfab20cf838885dd6acaf99b41844e diff --git a/craftsman/__pycache__/__init__.cpython-310.pyc b/craftsman/__pycache__/__init__.cpython-310.pyc index 5a51ef6f8fe535a1397913052837454aaa7d53f7..a8785076ecfc8e97f756e760ae2bb5095879ffa7 100644 Binary files a/craftsman/__pycache__/__init__.cpython-310.pyc and b/craftsman/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/__pycache__/__init__.cpython-311.pyc b/craftsman/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc69ac74bbcfa36fde97eb2762a774a67d228bc5 Binary files /dev/null and b/craftsman/__pycache__/__init__.cpython-311.pyc differ diff --git a/craftsman/__pycache__/pipeline.cpython-310.pyc b/craftsman/__pycache__/pipeline.cpython-310.pyc index 11f8f99c630037594255abc91320ca76286dcedb..e4fdf0d89b5c80301e20cb0b7a894bf4331efa14 100644 Binary files a/craftsman/__pycache__/pipeline.cpython-310.pyc and b/craftsman/__pycache__/pipeline.cpython-310.pyc differ diff --git a/craftsman/__pycache__/pipeline.cpython-311.pyc b/craftsman/__pycache__/pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80f55005a19035f7b9fa5f929d493aa8e071f7d6 Binary files /dev/null and b/craftsman/__pycache__/pipeline.cpython-311.pyc differ diff --git a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc index e386501d4e556a0b58284856146c0cc902fe116c..5a3e4cd7808df822b968a71f2d68a14dc29933d7 100644 Binary files a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc and b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc differ diff --git a/craftsman/data/__pycache__/__init__.cpython-310.pyc b/craftsman/data/__pycache__/__init__.cpython-310.pyc index a3984e5da4f87d7c9daa51190557b2d1b52905ae..0337eaaf4af9d87488de122e627d45f4ae88ecef 100644 Binary files a/craftsman/data/__pycache__/__init__.cpython-310.pyc and b/craftsman/data/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/data/__pycache__/base.cpython-310.pyc b/craftsman/data/__pycache__/base.cpython-310.pyc index 598c77443204d9b4fedab25daee82170de9c59e0..e04ec395e61d3acc020c5ad560f1e7594055799c 100644 Binary files a/craftsman/data/__pycache__/base.cpython-310.pyc and b/craftsman/data/__pycache__/base.cpython-310.pyc differ diff --git a/craftsman/data/base.py b/craftsman/data/base.py index 
a89fcbf466c1af978e498fb383eb52df10bc8d82..8b530cc41c75f9beab62c01760537ea3c8a60f9d 100755 --- a/craftsman/data/base.py +++ b/craftsman/data/base.py @@ -53,7 +53,7 @@ class BaseDataModuleConfig: # for occupancy and sdf data n_samples: int = 4096 # number of points in input point cloud upsample_ratio: int = 1 # upsample ratio for input point cloud - sampling_strategy: str = "random" # sampling strategy for input point cloud + sampling_strategy: Optional[str] = None # sampling strategy for input point cloud scale: float = 1.0 # scale of the input point cloud and target supervision load_supervision: bool = True # whether to load supervision supervision_type: str = "occupancy" # occupancy, sdf, tsdf @@ -70,6 +70,8 @@ class BaseDataModuleConfig: idx: Optional[List[int]] = None # index of the image to load n_views: int = 1 # number of views marign_pix_dis: int = 30 # margin of the bounding box + batch_size: int = 32 + num_workers: int = 8 class BaseDataset(Dataset): @@ -78,7 +80,7 @@ class BaseDataset(Dataset): self.cfg: BaseDataModuleConfig = cfg self.split = split - self.uids = json.load(open(f'{cfg.root_dir}/{split}.json')) + self.uids = json.load(open(f'{cfg.local_dir}/{split}.json')) print(f"Loaded {len(self.uids)} {split} uids") def __len__(self): @@ -94,10 +96,7 @@ class BaseDataset(Dataset): surface = np.concatenate([surface, normal], axis=1) elif self.cfg.geo_data_type == "sdf": # for sdf data with our own format - if re.match(r"\.\.", self.uids[index]): - data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz') - else: - data = np.load(f'{self.uids[index]}.npz') + data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz') # for input point cloud surface = data["surface"] else: @@ -112,6 +111,8 @@ class BaseDataset(Dataset): import fpsample kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5) surface = surface[kdline_fps_samples_idx] + elif self.cfg.sampling_strategy is None: + pass else: raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented") # rescale data @@ -189,9 +190,9 @@ class BaseDataset(Dataset): sel_idx = random.choice(self.cfg.idx) ret["sel_image_idx"] = sel_idx if self.cfg.image_type == "rgb": - img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.png" + img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.jpeg" elif self.cfg.image_type == "normal": - img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.png" + img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.jpeg" ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis) else: diff --git a/craftsman/models/__pycache__/__init__.cpython-310.pyc b/craftsman/models/__pycache__/__init__.cpython-310.pyc index 6f2a4aa9900ea2774cdf420f186133bc7841c02a..11adffc179d52bd2723fda25a7c93449622bc2cb 100644 Binary files a/craftsman/models/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc index 816e8317dfc4c21814e583abccec00f656238ef0..4dc405b92470faf12abfefebf27d6639c0c42d40 100644 Binary files 
a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc index 6dc2375a7d7d199b2d62a491b5a98b85c9587634..842c373c10f5f0f6ec7f42e2fbde457bd549bda2 100644 Binary files a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc index 5cdb9960006c8152e0087a74620d445905ca85c4..a95057bd6e6ca2a1ba21b989d773dd4832be6e79 100644 Binary files a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc index 1bd6963909f1f914109d6babaea136f7b82e87f9..1ca2aefda5cd18199e0848691a3271dba7fac3eb 100644 Binary files a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc index 1aa092ceb6e5e4d1b03c3ad1c906ba5a44f89763..45bca7ab683148d63f794b2f29a550371805ee1e 100644 Binary files a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc index 9b2783cc2194ea5717c0037b3b33547d75372724..45e0f8eff2e57853db37a7bbacb4bfbc213d03a2 100644 Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc index 4f0517d090506cdf564f4e2c281cc1f4e7db2667..982cdf8a3dffaba1863b7eb59dcfe55923e82a06 100644 Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/cond_encoder.py b/craftsman/models/conditional_encoders/cond_encoder.py old mode 100644 new mode 100755 index 1faa5eb44fc421e84af7e8f5c54a4c4389396464..85c75e249bcda50d33c2dff9ea57c4f5b3ff3136 --- a/craftsman/models/conditional_encoders/cond_encoder.py +++ b/craftsman/models/conditional_encoders/cond_encoder.py @@ -46,7 +46,6 @@ class CondEmbedder(BaseEmbedder): enable_gradient_checkpointing: bool = False embeds_fusion_mode: int = 1 # 0: sum | 1: concat linear_proj_init: str = "constant" - text_model_type: str = "clip" text_max_length: int = 77 
image_size_clip: int = 224 image_size_dino: int = 224 @@ -277,29 +276,9 @@ class CondEmbedder(BaseEmbedder): else: return vision_outputs.last_hidden_state - def post_process_embeds(self, text_embeds, visual_embeds): - clip_embeds, dino_embeds = visual_embeds.chunk(2, dim=2) - if self.cfg.normalize_embeds: - # post-process the text/visual embeds - if text_embeds is not None: - text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True) - if clip_embeds is not None: - clip_embeds = clip_embeds / clip_embeds.norm(dim=-1, keepdim=True) - if dino_embeds is not None: - dino_embeds = dino_embeds / dino_embeds.norm(dim=-1, keepdim=True) - - assert text_embeds is not None or dino_embeds is not None or clip_embeds is not None - - if text_embeds is not None and visual_embeds is not None: - return torch.cat([text_embeds, visual_embeds], dim=1) - elif text_embeds is not None: - return text_embeds - else: - return visual_embeds - def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor: clip_embeds = self.encode_image_clip(images, cameras) dino_embeds = self.encode_image_dino(images, cameras) dino_embeds = self.linear_proj(dino_embeds) visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1) - return visual_embeds \ No newline at end of file + return visual_embeds diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc index a8cc31eaf7786f8da942a578c437e98187aead95..e5febc512824e0aae8e91be9763a93fc54b69ddd 100644 Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc differ diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc index ddafb045cd5591010924c1b676245338418afee4..2e015e8893c63efecead9dd44e7710d08de78ef5 100644 Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc differ diff --git a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc index 7403ff05f007ce3dcd68a6b68853f6d9e2b9c765..821a33cc4bcdcbcd3b0bb42e5108209fdd5f10e5 100644 Binary files a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc index 64aab3f6a63158b80efed5c4714f60ef5cbd37c6..19b1973d0fdb7f557a328cea64758115cbee37a1 100644 Binary files a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc differ diff --git a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc index 8a5b02a2bf10278ba7fc10ac184473d9712092ae..59f9f8adaf4e3ea92c8ef9a2bb65a733b26e9e33 100644 Binary files 
a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc differ diff --git a/craftsman/models/denoisers/pixart_denoiser.py b/craftsman/models/denoisers/pixart_denoiser.py index b34f1caa2d4bf6d8385106f619dcad6314cb7281..4c64be12d1e680a08f1917d3a8d1f6c2c93dc1f4 100755 --- a/craftsman/models/denoisers/pixart_denoiser.py +++ b/craftsman/models/denoisers/pixart_denoiser.py @@ -25,15 +25,11 @@ class PixArtDinoDenoiser(BaseModule): context_dim: int = 1024 n_views: int = 1 context_ln: bool = True - skip_ln: bool = False init_scale: float = 0.25 use_checkpoint: bool = False drop_path: float = 0. - variance_type: str = "" - img_pos_embed: bool = False clip_weight: float = 1.0 dino_weight: float = 1.0 - dit_block: str = "" cfg: Config @@ -63,9 +59,8 @@ class PixArtDinoDenoiser(BaseModule): init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width) drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)] - ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block) self.blocks = nn.ModuleList([ - ditblock( + DiTBlock( width=self.cfg.width, heads=self.cfg.heads, init_scale=init_scale, @@ -82,11 +77,7 @@ class PixArtDinoDenoiser(BaseModule): ) # final layer - if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]: - self.output_channels = self.cfg.output_channels * 2 - else: - self.output_channels = self.cfg.output_channels - self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels) + self.final_layer = T2IFinalLayer(self.cfg.width, self.cfg.output_channels) self.identity_initialize() @@ -99,17 +90,6 @@ class PixArtDinoDenoiser(BaseModule): self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v self.load_state_dict(self.denoiser_ckpt, strict=False) - def forward_with_dpmsolver(self, model_input, timestep, context): - """ - dpm solver donnot need variance prediction - """ - # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb - model_out = self.forward(model_input, timestep, context) - if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]: - return model_out.chunk(2, dim=-1)[0] - else: - return model_out - def identity_initialize(self): for block in self.blocks: nn.init.constant_(block.attn.c_proj.weight, 0) diff --git a/craftsman/models/denoisers/utils.py b/craftsman/models/denoisers/utils.py old mode 100644 new mode 100755 index 9e29cbbf784d8cfa0683eda7b7e72b0cf68356ce..ab03bbe930cff27fab9c45ffbf25220551e024cc --- a/craftsman/models/denoisers/utils.py +++ b/craftsman/models/denoisers/utils.py @@ -10,126 +10,6 @@ from timm.models.layers import DropPath from craftsman.models.transformers.utils import MLP from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - def __init__( - self, - patch_size=16, - in_chans=3, - embed_dim=768, - norm_layer=None, - flatten=True, - bias=True, - ): - super().__init__() - patch_size = to_2tuple(patch_size) - self.patch_size = patch_size - self.flatten = flatten - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - return x - -class DiTBlock(nn.Module): - """ - A PixArt block with adaptive layer norm 
(adaLN-single) conditioning. - """ - - def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0): - super().__init__() - self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6) - self.attn = MultiheadAttention( - n_ctx=None, - width=width, - heads=heads, - init_scale=init_scale, - qkv_bias=qkv_bias, - use_flash=use_flash - ) - self.cross_attn = MultiheadCrossAttention( - n_data=None, - width=width, - heads=heads, - data_width=None, - init_scale=init_scale, - qkv_bias=qkv_bias, - use_flash=use_flash, - ) - - self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6) - - self.mlp = MLP(width=width, init_scale=init_scale) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5) - - def forward(self, x, visual_cond, t, **kwargs): - B, N, C = x.shape - - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1) - x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C)) - x = x + self.cross_attn(x, visual_cond) - x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) - - return x - -class DiTBlock_text(nn.Module): - """ - A PixArt block with adaptive layer norm (adaLN-single) conditioning. - """ - - def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0): - super().__init__() - self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6) - self.attn = MultiheadAttention( - n_ctx=None, - width=width, - heads=heads, - init_scale=init_scale, - qkv_bias=qkv_bias, - use_flash=use_flash - ) - self.cross_attn = MultiheadCrossAttention( - n_data=None, - width=width, - heads=heads, - data_width=None, - init_scale=init_scale, - qkv_bias=qkv_bias, - use_flash=use_flash, - ) - - self.cross_attn_extra = MultiheadCrossAttention( - n_data=None, - width=width, - heads=heads, - data_width=None, - init_scale=init_scale, - qkv_bias=qkv_bias, - use_flash=use_flash, - ) - self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6) - - self.mlp = MLP(width=width, init_scale=init_scale) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5) - - def forward(self, x, visual_cond, text_cond, t, **kwargs): - B, N, C = x.shape - - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1) - x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C)) - x = x + self.cross_attn(x, visual_cond) - x = x + self.cross_attn_extra(x, text_cond) - x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) - - return x - class DiTBlock(nn.Module): """ A DiT block with adaptive layer norm (adaLN-single) conditioning. 
@@ -174,11 +54,6 @@ class DiTBlock(nn.Module): def t2i_modulate(x, shift, scale): return x * (1 + scale) + shift -# def t2i_modulate(x, shift, scale): -# a = torch.ones_like(scale) -# a[..., 768:] = 0 -# return x * (a + scale) + shift - def auto_grad_checkpoint(module, *args, **kwargs): if getattr(module, 'grad_checkpointing', False): if not isinstance(module, Iterable): @@ -268,63 +143,4 @@ class T2IFinalLayer(nn.Module): shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1) x = t2i_modulate(self.norm_final(x), shift, scale) x = self.linear(x) - return x - -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) - out: (M, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float64) - omega /= embed_dim / 2. - omega = 1. / 10000 ** omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - -def _ntuple(n): - def parse(x): - if isinstance(x, Iterable) and not isinstance(x, str): - return x - return tuple(repeat(x, n)) - return parse - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) - -def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0, base_size=16): - """ - grid_size: int of the grid height and width - return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) - """ - if isinstance(grid_size, int): - grid_size = to_2tuple(grid_size) - grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / pe_interpolation - grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / pe_interpolation - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) - - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - if cls_token and extra_tokens > 0: - pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) - return pos_embed \ No newline at end of file + return x \ No newline at end of file diff --git a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc index 5e8a39cd825569e0bbe4ba22f7241f6cbedc7069..97ec08e72b9bc4b93582d20c4790036fd0dd42ce 100644 Binary files a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc index 54b91b362964b3f4242d48775f76166c08e6f05f..001496e810e820ce5e93dcbc470a3e43cacda8f6 100644 Binary files a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc differ diff --git a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc 
b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc index 759c2dac939bd6029335df19094e417576e369e9..74b1e26189f3c644f9a597d481406a61c116895e 100644 Binary files a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc differ diff --git a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc index 86aaa2ab10591ad95b7931524ed4ced417f7c8ac..a895397929ea800378c5387035a121236bf9567a 100644 Binary files a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc differ diff --git a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc index 7b0fc923ea0ee99e65f6e38b6a3e98df4ba5dd08..06bc047d40e34f7868fbec57464ff80b5030db62 100644 Binary files a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc differ diff --git a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc index ba2de6a0419f045de38bb33d2f42ef89a29a6fa1..f21f6acb7b0ac748ee2914a5cce09bef3228072a 100644 Binary files a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc differ diff --git a/craftsman/models/transformers/attention.py b/craftsman/models/transformers/attention.py old mode 100644 new mode 100755 index 81d179cf4e070c7a0520567b5e64dd3d7ed03792..e4623355b6d1d4d3d2c5c8827d23f8ecf4f0b0c6 --- a/craftsman/models/transformers/attention.py +++ b/craftsman/models/transformers/attention.py @@ -9,126 +9,6 @@ from craftsman.utils.checkpoint import checkpoint from .utils import init_linear, MLP from timm.models.vision_transformer import Attention -def scaled_dot_product_gqa( - query: Tensor, - key: Tensor, - value: Tensor, - dropout: float = 0.0, - scale: Optional[float] = None, - mask: Optional[Tensor] = None, - is_causal: Optional[bool] = None, - need_weights: bool = False, - average_attn_weights: bool = False, - force_grouped: bool = False, -): - """Scaled dot product attention with support for grouped queries. - - Einstein notation: - - b: batch size - - n / s: sequence length - - h: number of heads - - g: number of groups - - d: dimension of query/key/value - - Args: - query: Query tensor of shape (b, n, h, d) - key: Key tensor of shape (b, s, h, d) - value: Value tensor of shape (b, s, h, d) - dropout: Dropout probability (default: 0.0) - scale: Scale factor for query (default: d_query ** 0.5) - mask: Mask tensor of shape (b, n, s) or (b, s). If 'ndim == 2', the mask is - applied to all 'n' rows of the attention matrix. (default: None) - force_grouped: If True, apply grouped-query attention even if the number of - heads is equal for query, key, and value. (default: False) - - Returns: - 2-tuple of: - - Attention output with shape (b, n, h, d) - - (Optional) Attention weights with shape (b, h, n, s). Only returned if - 'need_weights' is True. - """ - if (mask is not None) and (is_causal is not None): - raise ValueError( - "Only one of 'mask' and 'is_causal' should be provided, but got both." 
- ) - elif not query.ndim == key.ndim == value.ndim == 4: - raise ValueError( - f"Expected query, key, and value to be 4-dimensional, but got shapes " - f"{query.shape}, {key.shape}, and {value.shape}." - ) - - # Move sequence length dimension to axis 2. - # This makes the attention operations below *much* faster. - query = rearrange(query, "b n h d -> b h n d") - key = rearrange(key, "b s h d -> b h s d") - value = rearrange(value, "b s h d -> b h s d") - - bq, hq, nq, dq = query.shape - bk, hk, nk, dk = key.shape - bv, hv, nv, dv = value.shape - if not (bq == bk == bv and dq == dk == dv): - raise ValueError( - "Expected query, key, and value to have the same batch size (dim=0) and " - f"embedding dimension (dim=3), but got query: {query.shape}, " - f"key: {key.shape}, and value: {value.shape}." - ) - elif (hk != hv) or (nk != nv): - raise ValueError( - "Expected key and value to have the same size in dimensions 1 and 2, but " - f"got key: {key.shape} and value: {value.shape}." - ) - elif hq % hk != 0: - raise ValueError( - "Expected query heads to be a multiple of key/value heads, but got " - f"query: {query.shape} and key/value: {key.shape}." - ) - - if scale is None: - scale = query.size(-1) ** 0.5 - query = query / scale - - num_head_groups = hq // hk - query = rearrange(query, "b (h g) n d -> b g h n d", g=num_head_groups) - similarity = einsum(query, key, "b g h n d, b h s d -> b g h n s") - - if is_causal: - # Mask out the upper triangular portion of the attention matrix. This prevents - # the model from attending to tokens in the future. - mask = torch.ones((bq, nq, nk), device=query.device, dtype=torch.bool).tril_() - - if mask is not None: - # Expand mask to match the shape of the attention matrix. - # If mask is 2D, assume that it is applied to the key/value sequence dimension. - # Else if mask is 3D, assume that it is applied to the query/key/value sequence - # dimension for all attention heads. - # - if mask.ndim == 2: - mask = rearrange(mask, "b s -> b () () () s") - elif mask.ndim == 3: - mask = rearrange(mask, "b n s -> b () () n s") - # Mask similarity values by setting them to negative infinity. This guarantees - # that they will not contribute to the softmax computation below. - similarity.masked_fill_(~mask, torch.finfo(similarity.dtype).min) - - attention = F.softmax(similarity, dim=-1) - if dropout > 0.0: - attention = F.dropout(attention, p=dropout) - - # Apply attention matrix to the value Tensor. - out = einsum(attention, value, "b g h n s, b h s d -> b g h n d") - # Move head dimension back to axis 2 - out = rearrange(out, "b g h n d -> b n (h g) d") - - attn_weights: Optional[Tensor] = None - if need_weights: - # Move the sequence dimensions back to positions 1, 2. Move the head dimension - # to position 3. This more closely matches the return shape of the attention - # output: (b, n, h, d). 
- attn_weights = rearrange(attention, "b g h n s -> b n s (h g)") - if average_attn_weights: - attn_weights = attn_weights.mean(dim=1) - - return out, attn_weights class MultiheadAttention(nn.Module): def __init__( @@ -327,4 +207,4 @@ class ResidualCrossAttentionBlock(nn.Module): def forward(self, x: torch.Tensor, data: torch.Tensor): x = x + self.attn(self.ln_1(x), self.ln_2(data)) x = x + self.mlp(self.ln_3(x)) - return x \ No newline at end of file + return x diff --git a/craftsman/models/transformers/perceiver_1d.py b/craftsman/models/transformers/perceiver_1d.py old mode 100644 new mode 100755 diff --git a/craftsman/models/transformers/utils.py b/craftsman/models/transformers/utils.py old mode 100644 new mode 100755 diff --git a/craftsman/pipeline.py b/craftsman/pipeline.py old mode 100644 new mode 100755 index 15edafd62330df4c7d3d282ab09bcd228a6353b4..a0d62fe0990761f59dc4e7f1fe79ddea4bf7c04d --- a/craftsman/pipeline.py +++ b/craftsman/pipeline.py @@ -158,6 +158,7 @@ class CraftsManPipeline(): background_color: List[int] = [255, 255, 255], foreground_ratio: float = 0.95, mc_depth: int = 8, + only_max_component: bool = False, ): r""" Function invoked when calling the pipeline for generation. @@ -198,6 +199,9 @@ class CraftsManPipeline(): mc_depth (`int`, *optional*, defaults to 8): The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z. 8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be. + only_max_component (`bool`, *optional*, defaults to `False`): + Whether to only keep the largest connected component of the mesh. This is useful when the mesh has + multiple components and only the largest one is needed. Examples: Returns: @@ -258,6 +262,15 @@ class CraftsManPipeline(): if output_type == "trimesh": import trimesh cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1]) + if only_max_component: + components = cur_mesh.split(only_watertight=False) + bbox = [] + for c in components: + bbmin = c.vertices.min(0) + bbmax = c.vertices.max(0) + bbox.append((bbmax - bbmin).max()) + max_component = np.argmax(bbox) + cur_mesh = components[max_component] mesh.append(cur_mesh) elif output_type == "np": mesh.append(mesh_v_f[0]) diff --git a/craftsman/systems/__pycache__/__init__.cpython-310.pyc b/craftsman/systems/__pycache__/__init__.cpython-310.pyc index a33a8274329244348289bbfd7de12c4a6533a0d1..275a97fa604c9525eb8a9aeb5cc7f70394a6f088 100644 Binary files a/craftsman/systems/__pycache__/__init__.cpython-310.pyc and b/craftsman/systems/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/systems/__pycache__/base.cpython-310.pyc b/craftsman/systems/__pycache__/base.cpython-310.pyc index 368251a7d4d3b344031271b791f22c803dbea049..22227831b79934f817e5104d93855117bd837fe5 100644 Binary files a/craftsman/systems/__pycache__/base.cpython-310.pyc and b/craftsman/systems/__pycache__/base.cpython-310.pyc differ diff --git a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc index 3dc2e3115a0738aa6d00e90c50b83cae66274e60..355b9b8ad90cef4e78e8a19d0489ed73aced96e0 100644 Binary files a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc and b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc differ diff --git a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc index 
12bf192a5b35186b6aa033695f0ea61c3272a554..71d14323b7666b642eb2b2a867484678f42eb5d4 100644 Binary files a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc and b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc differ diff --git a/craftsman/systems/__pycache__/utils.cpython-310.pyc b/craftsman/systems/__pycache__/utils.cpython-310.pyc old mode 100644 new mode 100755 diff --git a/craftsman/systems/pixart_diffusion.py b/craftsman/systems/pixart_diffusion.py old mode 100644 new mode 100755 index 529167af2542f7d857defa39074e8047817b0b8c..93466315855a0b1b93d44e8977eeeda103f95fe5 --- a/craftsman/systems/pixart_diffusion.py +++ b/craftsman/systems/pixart_diffusion.py @@ -251,9 +251,9 @@ class PixArtDiffusionSystem(BaseSystem): return { "loss_diffusion": loss, "latents": latents, - "x_t": x_t, + "x_t": noisy_z, "noise": noise, - "noise_pred": pred_noise, + "noise_pred": noise_pred, "timesteps": timesteps, } @@ -373,4 +373,4 @@ class PixArtDiffusionSystem(BaseSystem): return outputs def on_validation_epoch_end(self): - pass \ No newline at end of file + pass diff --git a/craftsman/utils/__pycache__/__init__.cpython-310.pyc b/craftsman/utils/__pycache__/__init__.cpython-310.pyc index b9c8c4a21e1ddb79aab68a042d8c41e41bae9ad4..dc8fc56aa251450661270183a09d2d2ef7ce4e3c 100644 Binary files a/craftsman/utils/__pycache__/__init__.cpython-310.pyc and b/craftsman/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/__init__.cpython-311.pyc b/craftsman/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d176e286f072ec0c415f0db0f8c81b0967893b89 Binary files /dev/null and b/craftsman/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/craftsman/utils/__pycache__/base.cpython-310.pyc b/craftsman/utils/__pycache__/base.cpython-310.pyc index 83a68795573ff8940846efa2c7d038d175d73b83..853925fb34f1b0e3cf043bcaaae367591f543ccd 100644 Binary files a/craftsman/utils/__pycache__/base.cpython-310.pyc and b/craftsman/utils/__pycache__/base.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/base.cpython-311.pyc b/craftsman/utils/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83f500bde0064775718572f0abc99760debdb29f Binary files /dev/null and b/craftsman/utils/__pycache__/base.cpython-311.pyc differ diff --git a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc index bd5b481c93c34571f33c0fc48ef145ed23b1fa58..fed9e95b7e88f3a21a4f3816a7f2ae36b5a1b295 100644 Binary files a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc and b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/config.cpython-310.pyc b/craftsman/utils/__pycache__/config.cpython-310.pyc index aac3aef5468c180268a0c8c52a1707e60d870822..45270df949585d3d479f430499777ffa86b42f68 100644 Binary files a/craftsman/utils/__pycache__/config.cpython-310.pyc and b/craftsman/utils/__pycache__/config.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/config.cpython-311.pyc b/craftsman/utils/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f70d9dc896ed0327846e4140fc2261f359ad954c Binary files /dev/null and b/craftsman/utils/__pycache__/config.cpython-311.pyc differ diff --git a/craftsman/utils/__pycache__/misc.cpython-310.pyc b/craftsman/utils/__pycache__/misc.cpython-310.pyc index 
3adf88e303ffbe6e877b42cbadc7f326fcbad751..b6c11bf8560b424fcd6f45d5da9202ddcde69eae 100644 Binary files a/craftsman/utils/__pycache__/misc.cpython-310.pyc and b/craftsman/utils/__pycache__/misc.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/ops.cpython-310.pyc b/craftsman/utils/__pycache__/ops.cpython-310.pyc index debfa41eae014018d9d0ba540248b2f77de3c23b..e8bdd120d0dc5c632d918740c6eeab3e0fb99f68 100644 Binary files a/craftsman/utils/__pycache__/ops.cpython-310.pyc and b/craftsman/utils/__pycache__/ops.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/saving.cpython-310.pyc b/craftsman/utils/__pycache__/saving.cpython-310.pyc index 4b38d835c68dc5a67dad09beea97d2bb2f47fcf4..a8fa8a52d8589ce573de44a75b1ebe40fdf99c9f 100644 Binary files a/craftsman/utils/__pycache__/saving.cpython-310.pyc and b/craftsman/utils/__pycache__/saving.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc index d7f9b4184fd39cf6b2df5076a97681f89ea577fe..7f65835c4ecd6f9233a169a24e5e14184a122ca4 100644 Binary files a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc and b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/typing.cpython-310.pyc b/craftsman/utils/__pycache__/typing.cpython-310.pyc index 24970b9142feaba6029154274a042c4c34fc6e92..79b8dae01c4afaf63968f2bb64582ba8204f29a1 100644 Binary files a/craftsman/utils/__pycache__/typing.cpython-310.pyc and b/craftsman/utils/__pycache__/typing.cpython-310.pyc differ diff --git a/craftsman/utils/__pycache__/typing.cpython-311.pyc b/craftsman/utils/__pycache__/typing.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..448acbdfe51e93efe4f6efbc5fa514db87a3902c Binary files /dev/null and b/craftsman/utils/__pycache__/typing.cpython-311.pyc differ diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..297f4a1b96ee50046dba8c079bdf3165c1f66728 --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,34 @@ +datasets==2.19.0 +diffusers==0.31.0 +einops==0.8.0 +huggingface-hub==0.26.2 +imageio==2.34.1 +jaxtyping==0.2.28 +joblib==1.4.0 +lightning-utilities==0.11.2 +matplotlib==3.8.4 +numpy==1.26.4 +omegaconf==2.3.0 +opencv-python==4.9.0.80 +pandas==2.2.2 +pillow==10.3.0 +plyfile==1.0.3 +PyMCubes==0.1.4 +pyparsing==3.1.2 +pytorch-lightning==2.2.4 +PyYAML==6.0.1 +safetensors==0.4.3 +scikit-image==0.23.2 +scipy==1.13.0 +tensorboard==2.16.2 +tensorboardX==2.6.2.2 +timm==0.9.16 +tokenizers==0.19.1 +tqdm==4.66.2 +transformers==4.40.1 +trimesh==4.3.2 +spaces==0.28.3 +accelerate==0.29.1 +rembg==2.0.59 +gradio==5.5.0 +wandb==0.18.6 \ No newline at end of file diff --git a/server.py b/server.py new file mode 100644 index 0000000000000000000000000000000000000000..614a21e19baac9991c6a16871cb0cb7f340797eb --- /dev/null +++ b/server.py @@ -0,0 +1,98 @@ +import argparse +import base64 +import os +from datetime import datetime +import traceback +import torch +import trimesh +from craftsman import CraftsManPipeline + +CURRENT_DIR = f'/tmp/native3d_server/{os.getpid()}' +os.makedirs(CURRENT_DIR, exist_ok=True) + +def parse_parameters(): + parser = argparse.ArgumentParser("native3d") + parser.add_argument('--host', default="0.0.0.0", type=str) + parser.add_argument('--port', default=80, type=int) + return parser.parse_args() + +# -------------------- fastapi -------------------- +from typing import Optional +from 
pydantic import BaseModel, Field + +class Native3DRequestV1(BaseModel): + image_path: str # input image path + mesh_path: str # output mesh path, support glb or obj in clean dir + +class Native3DResponseV1(BaseModel): + pass + +class Native3DRequestV2(BaseModel): + image_bytes: str # input image bytes(base64) + mesh_type: str # output mesh type, support glb or obj + +class Native3DResponseV2(BaseModel): + mesh_bytes: str # output mesh bytes(base64) + +if __name__=="__main__": + parse_args = parse_parameters() + + # prepare models + pipeline = CraftsManPipeline.from_pretrained("ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32) + + # -------------------- fastapi -------------------- + from fastapi import FastAPI, Request + import requests + app = FastAPI() + + @app.post("/native3d_v1", response_model=Native3DResponseV1) + async def native3d(request: Request, image_to_mesh_request: Native3DRequestV1): + try: + print(f"image_to_mesh_request = {image_to_mesh_request}") + mesh = pipeline(image_to_mesh_request.image_path).meshes[0] + os.makedirs(os.path.dirname(os.path.abspath(image_to_mesh_request.mesh_path)), exist_ok=True) + mesh.export(image_to_mesh_request.mesh_path) + except Exception as e: + traceback.print_exc() + print(f"generate_model error: {e}") + return Native3DResponseV1() + + @app.post("/native3d_v2", response_model=Native3DResponseV2) + async def native3d(request: Request, image_to_mesh_request: Native3DRequestV2): + try: + # print(f"image_to_mesh_request = {image_to_mesh_request}") + mesh_type = image_to_mesh_request.mesh_type + assert mesh_type in ['obj', 'glb'] + task_id = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') + '-' + 'native3d' + current_dir = os.path.join(CURRENT_DIR, task_id) + os.makedirs(current_dir, exist_ok=True) + image_path = os.path.join(current_dir, 'input_image.png') + with open(image_path, 'wb') as f: + f.write(base64.b64decode(image_to_mesh_request.image_bytes)) + mesh_path = os.path.join(current_dir, f'output_mesh.{mesh_type}') + import time + start = time.time() + # mesh = pipeline(image_path, mc_depth=8, num_inference_steps=25).meshes[0] + # mesh = pipeline(image_path, mc_depth=7, num_inference_steps=25).meshes[0] + mesh = pipeline(image_path, mc_depth=7, num_inference_steps=50).meshes[0] + print(f"Time: {time.time() - start}s") + os.makedirs(os.path.dirname(os.path.abspath(mesh_path)), exist_ok=True) + mesh.visual = trimesh.visual.TextureVisuals( + material=trimesh.visual.material.PBRMaterial( + baseColorFactor=(255, 255, 255), main_color=(255, 255, 255), metallicFactor=0.05, roughnessFactor=1.0 + ) + ) + mesh.export(mesh_path) + with open(mesh_path, 'rb') as f: + mesh_bytes = f.read() + except Exception as e: + traceback.print_exc() + print(f"generate_model error: {e}") + return Native3DResponseV2(mesh_bytes=base64.b64encode(mesh_bytes).decode('utf-8')) + + @app.get("/health") + async def health(): + return {"status": "OK"} + + import uvicorn + uvicorn.run(app, host=parse_args.host, port=parse_args.port)
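For reference, a minimal client sketch for the `/native3d_v2` endpoint defined in `server.py` above. The localhost URL, port, and file names are illustrative assumptions; the request and response fields follow `Native3DRequestV2` and `Native3DResponseV2`.

```python
# minimal client sketch for the /native3d_v2 endpoint in server.py
import base64
import requests

# encode the input image as base64, as expected by Native3DRequestV2.image_bytes
with open("input_image.png", "rb") as f:  # hypothetical local image path
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:80/native3d_v2",    # assumes the server runs locally on the default port
    json={"image_bytes": image_b64, "mesh_type": "glb"},
)
resp.raise_for_status()

# the response carries the generated mesh as base64 in 'mesh_bytes'
with open("output_mesh.glb", "wb") as f:
    f.write(base64.b64decode(resp.json()["mesh_bytes"]))
```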