+##### [Weiyu Li*1,2](https://wyysf-98.github.io/), Jiarui Liu*1,2, Hongyu Yan*1,2, [Rui Chen1,2](https://aruichen.github.io/), [Yixun Liang2,3](https://yixunliang.github.io/), [Xuelin Chen4](https://xuelin-chen.github.io/), [Ping Tan1,2](https://ece.hkust.edu.hk/pingtan), [Xiaoxiao Long1,2](https://www.xxlong.site/)
+##### 1HKUST, 2LightIllusions, 3HKUST(GZ), 4Tencent AI Lab
+
+
+# Usage
+
+To use the model, please refer to the [official repository](https://github.com/wyysf-98/CraftsMan) for installation and usage instructions.
+
+```python
+from craftsman import CraftsManPipeline
+import torch
+
+# load the pipeline from a local checkpoint
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+# generate a mesh from a single image and export it
+mesh = pipeline("https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png").meshes[0]
+mesh.export("werewolf.obj")
+```
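+
+The pipeline call also accepts a few optional arguments, e.g. `num_inference_steps`, `mc_depth` (Marching Cubes resolution), and `only_max_component`. A minimal sketch; the values below are illustrative, not tuned defaults:
+
+```python
+from craftsman import CraftsManPipeline
+import torch
+
+pipeline = CraftsManPipeline.from_pretrained("./ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+mesh = pipeline(
+    "https://pub-f9073a756ec645d692ce3d171c2e1232.r2.dev/data/werewolf.png",
+    num_inference_steps=50,    # more denoising steps: slower, usually cleaner geometry
+    mc_depth=7,                # Marching Cubes resolution: 7 -> 128 cubes per axis, 8 -> 256
+    only_max_component=True,   # keep only the largest connected component of the mesh
+).meshes[0]
+mesh.export("werewolf.glb")
+```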
+
+## 🔥🔥🔥 News!!
+
+* Nov 16, 2024: 💬 We release CraftsMan-v1-5.
+
+
+## 📑 Open-source Plan
+
+- [x] Inference
+- [x] Checkpoints
+- [x] Training
+- [ ] ComfyUI
+
+## 🎉 **CraftsMan-v1-5 Architecture**
+
+
+
+
+
+
+## Get Started
+
+#### Begin by cloning the repository:
+
+```shell
+git clone https://github.com/wyysf-98/CraftsMan
+cd CraftsMan
+```
+
+#### Installation Guide for Linux
+
+We provide an `env_install.sh` script for setting up the environment.
+
+```shell
+# step 1. create conda env
+conda create -n CraftsMan python=3.10
+conda activate CraftsMan
+
+# step 2. install torch-related packages
+conda install pytorch==2.3.0 torchvision==0.18.0 pytorch-cuda=11.8 -c pytorch -c nvidia
+
+# step 3. install other packages
+pip install -r docker/requirements.txt
+```
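+
+Optionally, a quick sanity check that the environment was set up correctly (assumes the `CraftsMan` env is activated):
+
+```shell
+python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+```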
+
+
+
+#### Using Gradio
+
+We provide a Gradio demo for trying out the model. Run the following command to start it:
+
+```shell
+python3 gradio.py
+```
+
+The demo can then be accessed through the link printed in the console output.
+
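+#### Using the API Server
+
+The repository also ships a small FastAPI server (`server.py`) that wraps the pipeline. A minimal sketch of starting it and posting a request to the `/native3d_v1` endpoint; the port and the image/mesh paths below are placeholders:
+
+```shell
+python3 server.py --port 8000
+```
+
+```python
+import requests
+
+# /native3d_v1 loads the input image from disk and writes the generated mesh to mesh_path
+resp = requests.post(
+    "http://localhost:8000/native3d_v1",
+    json={"image_path": "assets/input.png", "mesh_path": "outputs/mesh.obj"},
+)
+print(resp.status_code)
+```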
+
+## Citation
+
+If you find this repository helpful, please cite our report:
+```bibtex
+@misc{li2024craftsman,
+    title = {CraftsMan: High-fidelity Mesh Generation with 3D Native Generation and Interactive Geometry Refiner},
+    author = {Weiyu Li and Jiarui Liu and Rui Chen and Yixun Liang and Xuelin Chen and Ping Tan and Xiaoxiao Long},
+    year = {2024},
+    eprint = {2405.14979},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CG}
+}
+```
+
+
+# License
+
+[creativeml-openrail-m](https://raw.githubusercontent.com/CompVis/stable-diffusion/refs/heads/main/LICENSE)
\ No newline at end of file
diff --git a/ckpts/craftsman-v1-5 b/ckpts/craftsman-v1-5
new file mode 160000
index 0000000000000000000000000000000000000000..9a5e9189c2dfab20cf838885dd6acaf99b41844e
--- /dev/null
+++ b/ckpts/craftsman-v1-5
@@ -0,0 +1 @@
+Subproject commit 9a5e9189c2dfab20cf838885dd6acaf99b41844e
diff --git a/craftsman/__pycache__/__init__.cpython-310.pyc b/craftsman/__pycache__/__init__.cpython-310.pyc
index 5a51ef6f8fe535a1397913052837454aaa7d53f7..a8785076ecfc8e97f756e760ae2bb5095879ffa7 100644
Binary files a/craftsman/__pycache__/__init__.cpython-310.pyc and b/craftsman/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/__pycache__/__init__.cpython-311.pyc b/craftsman/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc69ac74bbcfa36fde97eb2762a774a67d228bc5
Binary files /dev/null and b/craftsman/__pycache__/__init__.cpython-311.pyc differ
diff --git a/craftsman/__pycache__/pipeline.cpython-310.pyc b/craftsman/__pycache__/pipeline.cpython-310.pyc
index 11f8f99c630037594255abc91320ca76286dcedb..e4fdf0d89b5c80301e20cb0b7a894bf4331efa14 100644
Binary files a/craftsman/__pycache__/pipeline.cpython-310.pyc and b/craftsman/__pycache__/pipeline.cpython-310.pyc differ
diff --git a/craftsman/__pycache__/pipeline.cpython-311.pyc b/craftsman/__pycache__/pipeline.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80f55005a19035f7b9fa5f929d493aa8e071f7d6
Binary files /dev/null and b/craftsman/__pycache__/pipeline.cpython-311.pyc differ
diff --git a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc
index e386501d4e556a0b58284856146c0cc902fe116c..5a3e4cd7808df822b968a71f2d68a14dc29933d7 100644
Binary files a/craftsman/data/__pycache__/Objaverse.cpython-310.pyc and b/craftsman/data/__pycache__/Objaverse.cpython-310.pyc differ
diff --git a/craftsman/data/__pycache__/__init__.cpython-310.pyc b/craftsman/data/__pycache__/__init__.cpython-310.pyc
index a3984e5da4f87d7c9daa51190557b2d1b52905ae..0337eaaf4af9d87488de122e627d45f4ae88ecef 100644
Binary files a/craftsman/data/__pycache__/__init__.cpython-310.pyc and b/craftsman/data/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/data/__pycache__/base.cpython-310.pyc b/craftsman/data/__pycache__/base.cpython-310.pyc
index 598c77443204d9b4fedab25daee82170de9c59e0..e04ec395e61d3acc020c5ad560f1e7594055799c 100644
Binary files a/craftsman/data/__pycache__/base.cpython-310.pyc and b/craftsman/data/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/data/base.py b/craftsman/data/base.py
index a89fcbf466c1af978e498fb383eb52df10bc8d82..8b530cc41c75f9beab62c01760537ea3c8a60f9d 100755
--- a/craftsman/data/base.py
+++ b/craftsman/data/base.py
@@ -53,7 +53,7 @@ class BaseDataModuleConfig:
# for occupancy and sdf data
n_samples: int = 4096 # number of points in input point cloud
upsample_ratio: int = 1 # upsample ratio for input point cloud
- sampling_strategy: str = "random" # sampling strategy for input point cloud
+ sampling_strategy: Optional[str] = None # sampling strategy for input point cloud
scale: float = 1.0 # scale of the input point cloud and target supervision
load_supervision: bool = True # whether to load supervision
supervision_type: str = "occupancy" # occupancy, sdf, tsdf
@@ -70,6 +70,8 @@ class BaseDataModuleConfig:
idx: Optional[List[int]] = None # index of the image to load
n_views: int = 1 # number of views
marign_pix_dis: int = 30 # margin of the bounding box
+ batch_size: int = 32
+ num_workers: int = 8
class BaseDataset(Dataset):
@@ -78,7 +80,7 @@ class BaseDataset(Dataset):
self.cfg: BaseDataModuleConfig = cfg
self.split = split
- self.uids = json.load(open(f'{cfg.root_dir}/{split}.json'))
+ self.uids = json.load(open(f'{cfg.local_dir}/{split}.json'))
print(f"Loaded {len(self.uids)} {split} uids")
def __len__(self):
@@ -94,10 +96,7 @@ class BaseDataset(Dataset):
surface = np.concatenate([surface, normal], axis=1)
elif self.cfg.geo_data_type == "sdf":
# for sdf data with our own format
- if re.match(r"\.\.", self.uids[index]):
- data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
- else:
- data = np.load(f'{self.uids[index]}.npz')
+ data = np.load(f'{self.cfg.geo_data_path}/{self.uids[index]}.npz')
# for input point cloud
surface = data["surface"]
else:
@@ -112,6 +111,8 @@ class BaseDataset(Dataset):
import fpsample
kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(surface[:, :3], self.cfg.n_samples, h=5)
surface = surface[kdline_fps_samples_idx]
+ elif self.cfg.sampling_strategy is None:
+ pass
else:
raise NotImplementedError(f"sampling strategy {self.cfg.sampling_strategy} not implemented")
# rescale data
@@ -189,9 +190,9 @@ class BaseDataset(Dataset):
sel_idx = random.choice(self.cfg.idx)
ret["sel_image_idx"] = sel_idx
if self.cfg.image_type == "rgb":
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.png"
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_rgb.jpeg"
elif self.cfg.image_type == "normal":
- img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.png"
+ img_path = f'{self.cfg.image_data_path}/' + "/".join(self.uids[index].split('/')[-2:]) + f"/{'{:04d}'.format(sel_idx)}_normal.jpeg"
ret["image"], ret["mask"] = _load_single_image(img_path, background_color, self.cfg.marign_pix_dis)
else:
diff --git a/craftsman/models/__pycache__/__init__.cpython-310.pyc b/craftsman/models/__pycache__/__init__.cpython-310.pyc
index 6f2a4aa9900ea2774cdf420f186133bc7841c02a..11adffc179d52bd2723fda25a7c93449622bc2cb 100644
Binary files a/craftsman/models/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc
index 816e8317dfc4c21814e583abccec00f656238ef0..4dc405b92470faf12abfefebf27d6639c0c42d40 100644
Binary files a/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc
index 6dc2375a7d7d199b2d62a491b5a98b85c9587634..842c373c10f5f0f6ec7f42e2fbde457bd549bda2 100644
Binary files a/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc and b/craftsman/models/autoencoders/__pycache__/michelangelo_autoencoder.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc
index 5cdb9960006c8152e0087a74620d445905ca85c4..a95057bd6e6ca2a1ba21b989d773dd4832be6e79 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc
index 1bd6963909f1f914109d6babaea136f7b82e87f9..1ca2aefda5cd18199e0848691a3271dba7fac3eb 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc
index 1aa092ceb6e5e4d1b03c3ad1c906ba5a44f89763..45bca7ab683148d63f794b2f29a550371805ee1e 100644
Binary files a/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc and b/craftsman/models/conditional_encoders/__pycache__/cond_encoder.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc
index 9b2783cc2194ea5717c0037b3b33547d75372724..45e0f8eff2e57853db37a7bbacb4bfbc213d03a2 100644
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_clip.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc
index 4f0517d090506cdf564f4e2c281cc1f4e7db2667..982cdf8a3dffaba1863b7eb59dcfe55923e82a06 100644
Binary files a/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc and b/craftsman/models/conditional_encoders/clip/__pycache__/modeling_conditional_clip.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/cond_encoder.py b/craftsman/models/conditional_encoders/cond_encoder.py
old mode 100644
new mode 100755
index 1faa5eb44fc421e84af7e8f5c54a4c4389396464..85c75e249bcda50d33c2dff9ea57c4f5b3ff3136
--- a/craftsman/models/conditional_encoders/cond_encoder.py
+++ b/craftsman/models/conditional_encoders/cond_encoder.py
@@ -46,7 +46,6 @@ class CondEmbedder(BaseEmbedder):
enable_gradient_checkpointing: bool = False
embeds_fusion_mode: int = 1 # 0: sum | 1: concat
linear_proj_init: str = "constant"
- text_model_type: str = "clip"
text_max_length: int = 77
image_size_clip: int = 224
image_size_dino: int = 224
@@ -277,29 +276,9 @@ class CondEmbedder(BaseEmbedder):
else:
return vision_outputs.last_hidden_state
- def post_process_embeds(self, text_embeds, visual_embeds):
- clip_embeds, dino_embeds = visual_embeds.chunk(2, dim=2)
- if self.cfg.normalize_embeds:
- # post-process the text/visual embeds
- if text_embeds is not None:
- text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
- if clip_embeds is not None:
- clip_embeds = clip_embeds / clip_embeds.norm(dim=-1, keepdim=True)
- if dino_embeds is not None:
- dino_embeds = dino_embeds / dino_embeds.norm(dim=-1, keepdim=True)
-
- assert text_embeds is not None or dino_embeds is not None or clip_embeds is not None
-
- if text_embeds is not None and visual_embeds is not None:
- return torch.cat([text_embeds, visual_embeds], dim=1)
- elif text_embeds is not None:
- return text_embeds
- else:
- return visual_embeds
-
def encode_image(self, images: Iterable[Optional[ImageType]], cameras: Optional[torch.Tensor] = None, force_none_camera_embeds: bool = False, return_dict: bool = False, **kwargs) -> torch.FloatTensor:
clip_embeds = self.encode_image_clip(images, cameras)
dino_embeds = self.encode_image_dino(images, cameras)
dino_embeds = self.linear_proj(dino_embeds)
visual_embeds = torch.cat([clip_embeds, dino_embeds], dim=1)
- return visual_embeds
\ No newline at end of file
+ return visual_embeds
diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc
index a8cc31eaf7786f8da942a578c437e98187aead95..e5febc512824e0aae8e91be9763a93fc54b69ddd 100644
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_conditional_dinov2.cpython-310.pyc differ
diff --git a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc
index ddafb045cd5591010924c1b676245338418afee4..2e015e8893c63efecead9dd44e7710d08de78ef5 100644
Binary files a/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc and b/craftsman/models/conditional_encoders/dino_v2/__pycache__/modeling_dinov2.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc
index 7403ff05f007ce3dcd68a6b68853f6d9e2b9c765..821a33cc4bcdcbcd3b0bb42e5108209fdd5f10e5 100644
Binary files a/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc
index 64aab3f6a63158b80efed5c4714f60ef5cbd37c6..19b1973d0fdb7f557a328cea64758115cbee37a1 100644
Binary files a/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/pixart_denoiser.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc
index 8a5b02a2bf10278ba7fc10ac184473d9712092ae..59f9f8adaf4e3ea92c8ef9a2bb65a733b26e9e33 100644
Binary files a/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/denoisers/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/denoisers/pixart_denoiser.py b/craftsman/models/denoisers/pixart_denoiser.py
index b34f1caa2d4bf6d8385106f619dcad6314cb7281..4c64be12d1e680a08f1917d3a8d1f6c2c93dc1f4 100755
--- a/craftsman/models/denoisers/pixart_denoiser.py
+++ b/craftsman/models/denoisers/pixart_denoiser.py
@@ -25,15 +25,11 @@ class PixArtDinoDenoiser(BaseModule):
context_dim: int = 1024
n_views: int = 1
context_ln: bool = True
- skip_ln: bool = False
init_scale: float = 0.25
use_checkpoint: bool = False
drop_path: float = 0.
- variance_type: str = ""
- img_pos_embed: bool = False
clip_weight: float = 1.0
dino_weight: float = 1.0
- dit_block: str = ""
cfg: Config
@@ -63,9 +59,8 @@ class PixArtDinoDenoiser(BaseModule):
init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
- ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block)
self.blocks = nn.ModuleList([
- ditblock(
+ DiTBlock(
width=self.cfg.width,
heads=self.cfg.heads,
init_scale=init_scale,
@@ -82,11 +77,7 @@ class PixArtDinoDenoiser(BaseModule):
)
# final layer
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
- self.output_channels = self.cfg.output_channels * 2
- else:
- self.output_channels = self.cfg.output_channels
- self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels)
+ self.final_layer = T2IFinalLayer(self.cfg.width, self.cfg.output_channels)
self.identity_initialize()
@@ -99,17 +90,6 @@ class PixArtDinoDenoiser(BaseModule):
self.denoiser_ckpt[k.replace('denoiser_model.', '')] = v
self.load_state_dict(self.denoiser_ckpt, strict=False)
- def forward_with_dpmsolver(self, model_input, timestep, context):
- """
- dpm solver donnot need variance prediction
- """
- # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
- model_out = self.forward(model_input, timestep, context)
- if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
- return model_out.chunk(2, dim=-1)[0]
- else:
- return model_out
-
def identity_initialize(self):
for block in self.blocks:
nn.init.constant_(block.attn.c_proj.weight, 0)
diff --git a/craftsman/models/denoisers/utils.py b/craftsman/models/denoisers/utils.py
old mode 100644
new mode 100755
index 9e29cbbf784d8cfa0683eda7b7e72b0cf68356ce..ab03bbe930cff27fab9c45ffbf25220551e024cc
--- a/craftsman/models/denoisers/utils.py
+++ b/craftsman/models/denoisers/utils.py
@@ -10,126 +10,6 @@ from timm.models.layers import DropPath
from craftsman.models.transformers.utils import MLP
from craftsman.models.transformers.attention import MultiheadAttention, MultiheadCrossAttention
-class PatchEmbed(nn.Module):
- """ 2D Image to Patch Embedding
- """
- def __init__(
- self,
- patch_size=16,
- in_chans=3,
- embed_dim=768,
- norm_layer=None,
- flatten=True,
- bias=True,
- ):
- super().__init__()
- patch_size = to_2tuple(patch_size)
- self.patch_size = patch_size
- self.flatten = flatten
- self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
- self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
-
- def forward(self, x):
- x = self.proj(x)
- if self.flatten:
- x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
- x = self.norm(x)
- return x
-
-class DiTBlock(nn.Module):
- """
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
- """
-
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
- super().__init__()
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
- self.attn = MultiheadAttention(
- n_ctx=None,
- width=width,
- heads=heads,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash
- )
- self.cross_attn = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
-
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
- self.mlp = MLP(width=width, init_scale=init_scale)
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
- def forward(self, x, visual_cond, t, **kwargs):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
- x = x + self.cross_attn(x, visual_cond)
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
- return x
-
-class DiTBlock_text(nn.Module):
- """
- A PixArt block with adaptive layer norm (adaLN-single) conditioning.
- """
-
- def __init__(self, width, heads, init_scale=1.0, qkv_bias=True, use_flash=True, drop_path=0.0):
- super().__init__()
- self.norm1 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
- self.attn = MultiheadAttention(
- n_ctx=None,
- width=width,
- heads=heads,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash
- )
- self.cross_attn = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
-
- self.cross_attn_extra = MultiheadCrossAttention(
- n_data=None,
- width=width,
- heads=heads,
- data_width=None,
- init_scale=init_scale,
- qkv_bias=qkv_bias,
- use_flash=use_flash,
- )
- self.norm2 = nn.LayerNorm(width, elementwise_affine=True, eps=1e-6)
-
- self.mlp = MLP(width=width, init_scale=init_scale)
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
- self.scale_shift_table = nn.Parameter(torch.randn(6, width) / width ** 0.5)
-
- def forward(self, x, visual_cond, text_cond, t, **kwargs):
- B, N, C = x.shape
-
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None] + t.reshape(B, 6, -1)).chunk(6, dim=1)
- x = x + self.drop_path(gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa)).reshape(B, N, C))
- x = x + self.cross_attn(x, visual_cond)
- x = x + self.cross_attn_extra(x, text_cond)
- x = x + self.drop_path(gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
- return x
-
class DiTBlock(nn.Module):
"""
A DiT block with adaptive layer norm (adaLN-single) conditioning.
@@ -174,11 +54,6 @@ class DiTBlock(nn.Module):
def t2i_modulate(x, shift, scale):
return x * (1 + scale) + shift
-# def t2i_modulate(x, shift, scale):
-# a = torch.ones_like(scale)
-# a[..., 768:] = 0
-# return x * (a + scale) + shift
-
def auto_grad_checkpoint(module, *args, **kwargs):
if getattr(module, 'grad_checkpointing', False):
if not isinstance(module, Iterable):
@@ -268,63 +143,4 @@ class T2IFinalLayer(nn.Module):
shift, scale = (self.scale_shift_table[None] + t[:, None]).chunk(2, dim=1)
x = t2i_modulate(self.norm_final(x), shift, scale)
x = self.linear(x)
- return x
-
-def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
- """
- embed_dim: output dimension for each position
- pos: a list of positions to be encoded: size (M,)
- out: (M, D)
- """
- assert embed_dim % 2 == 0
- omega = np.arange(embed_dim // 2, dtype=np.float64)
- omega /= embed_dim / 2.
- omega = 1. / 10000 ** omega # (D/2,)
-
- pos = pos.reshape(-1) # (M,)
- out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
-
- emb_sin = np.sin(out) # (M, D/2)
- emb_cos = np.cos(out) # (M, D/2)
-
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
- return emb
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
- assert embed_dim % 2 == 0
-
- # use half of dimensions to encode grid_h
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
-
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
- return emb
-
-def _ntuple(n):
- def parse(x):
- if isinstance(x, Iterable) and not isinstance(x, str):
- return x
- return tuple(repeat(x, n))
- return parse
-
-to_1tuple = _ntuple(1)
-to_2tuple = _ntuple(2)
-
-def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0, base_size=16):
- """
- grid_size: int of the grid height and width
- return:
- pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
- """
- if isinstance(grid_size, int):
- grid_size = to_2tuple(grid_size)
- grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0]/base_size) / pe_interpolation
- grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1]/base_size) / pe_interpolation
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
- grid = np.stack(grid, axis=0)
- grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
-
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
- if cls_token and extra_tokens > 0:
- pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
- return pos_embed
\ No newline at end of file
+ return x
\ No newline at end of file
diff --git a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc
index 5e8a39cd825569e0bbe4ba22f7241f6cbedc7069..97ec08e72b9bc4b93582d20c4790036fd0dd42ce 100644
Binary files a/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc
index 54b91b362964b3f4242d48775f76166c08e6f05f..001496e810e820ce5e93dcbc470a3e43cacda8f6 100644
Binary files a/craftsman/models/geometry/__pycache__/base.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc
index 759c2dac939bd6029335df19094e417576e369e9..74b1e26189f3c644f9a597d481406a61c116895e 100644
Binary files a/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc and b/craftsman/models/geometry/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc
index 86aaa2ab10591ad95b7931524ed4ced417f7c8ac..a895397929ea800378c5387035a121236bf9567a 100644
Binary files a/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/attention.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc
index 7b0fc923ea0ee99e65f6e38b6a3e98df4ba5dd08..06bc047d40e34f7868fbec57464ff80b5030db62 100644
Binary files a/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/perceiver_1d.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc
index ba2de6a0419f045de38bb33d2f42ef89a29a6fa1..f21f6acb7b0ac748ee2914a5cce09bef3228072a 100644
Binary files a/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc and b/craftsman/models/transformers/__pycache__/utils.cpython-310.pyc differ
diff --git a/craftsman/models/transformers/attention.py b/craftsman/models/transformers/attention.py
old mode 100644
new mode 100755
index 81d179cf4e070c7a0520567b5e64dd3d7ed03792..e4623355b6d1d4d3d2c5c8827d23f8ecf4f0b0c6
--- a/craftsman/models/transformers/attention.py
+++ b/craftsman/models/transformers/attention.py
@@ -9,126 +9,6 @@ from craftsman.utils.checkpoint import checkpoint
from .utils import init_linear, MLP
from timm.models.vision_transformer import Attention
-def scaled_dot_product_gqa(
- query: Tensor,
- key: Tensor,
- value: Tensor,
- dropout: float = 0.0,
- scale: Optional[float] = None,
- mask: Optional[Tensor] = None,
- is_causal: Optional[bool] = None,
- need_weights: bool = False,
- average_attn_weights: bool = False,
- force_grouped: bool = False,
-):
- """Scaled dot product attention with support for grouped queries.
-
- Einstein notation:
- - b: batch size
- - n / s: sequence length
- - h: number of heads
- - g: number of groups
- - d: dimension of query/key/value
-
- Args:
- query: Query tensor of shape (b, n, h, d)
- key: Key tensor of shape (b, s, h, d)
- value: Value tensor of shape (b, s, h, d)
- dropout: Dropout probability (default: 0.0)
- scale: Scale factor for query (default: d_query ** 0.5)
- mask: Mask tensor of shape (b, n, s) or (b, s). If 'ndim == 2', the mask is
- applied to all 'n' rows of the attention matrix. (default: None)
- force_grouped: If True, apply grouped-query attention even if the number of
- heads is equal for query, key, and value. (default: False)
-
- Returns:
- 2-tuple of:
- - Attention output with shape (b, n, h, d)
- - (Optional) Attention weights with shape (b, h, n, s). Only returned if
- 'need_weights' is True.
- """
- if (mask is not None) and (is_causal is not None):
- raise ValueError(
- "Only one of 'mask' and 'is_causal' should be provided, but got both."
- )
- elif not query.ndim == key.ndim == value.ndim == 4:
- raise ValueError(
- f"Expected query, key, and value to be 4-dimensional, but got shapes "
- f"{query.shape}, {key.shape}, and {value.shape}."
- )
-
- # Move sequence length dimension to axis 2.
- # This makes the attention operations below *much* faster.
- query = rearrange(query, "b n h d -> b h n d")
- key = rearrange(key, "b s h d -> b h s d")
- value = rearrange(value, "b s h d -> b h s d")
-
- bq, hq, nq, dq = query.shape
- bk, hk, nk, dk = key.shape
- bv, hv, nv, dv = value.shape
- if not (bq == bk == bv and dq == dk == dv):
- raise ValueError(
- "Expected query, key, and value to have the same batch size (dim=0) and "
- f"embedding dimension (dim=3), but got query: {query.shape}, "
- f"key: {key.shape}, and value: {value.shape}."
- )
- elif (hk != hv) or (nk != nv):
- raise ValueError(
- "Expected key and value to have the same size in dimensions 1 and 2, but "
- f"got key: {key.shape} and value: {value.shape}."
- )
- elif hq % hk != 0:
- raise ValueError(
- "Expected query heads to be a multiple of key/value heads, but got "
- f"query: {query.shape} and key/value: {key.shape}."
- )
-
- if scale is None:
- scale = query.size(-1) ** 0.5
- query = query / scale
-
- num_head_groups = hq // hk
- query = rearrange(query, "b (h g) n d -> b g h n d", g=num_head_groups)
- similarity = einsum(query, key, "b g h n d, b h s d -> b g h n s")
-
- if is_causal:
- # Mask out the upper triangular portion of the attention matrix. This prevents
- # the model from attending to tokens in the future.
- mask = torch.ones((bq, nq, nk), device=query.device, dtype=torch.bool).tril_()
-
- if mask is not None:
- # Expand mask to match the shape of the attention matrix.
- # If mask is 2D, assume that it is applied to the key/value sequence dimension.
- # Else if mask is 3D, assume that it is applied to the query/key/value sequence
- # dimension for all attention heads.
- #
- if mask.ndim == 2:
- mask = rearrange(mask, "b s -> b () () () s")
- elif mask.ndim == 3:
- mask = rearrange(mask, "b n s -> b () () n s")
- # Mask similarity values by setting them to negative infinity. This guarantees
- # that they will not contribute to the softmax computation below.
- similarity.masked_fill_(~mask, torch.finfo(similarity.dtype).min)
-
- attention = F.softmax(similarity, dim=-1)
- if dropout > 0.0:
- attention = F.dropout(attention, p=dropout)
-
- # Apply attention matrix to the value Tensor.
- out = einsum(attention, value, "b g h n s, b h s d -> b g h n d")
- # Move head dimension back to axis 2
- out = rearrange(out, "b g h n d -> b n (h g) d")
-
- attn_weights: Optional[Tensor] = None
- if need_weights:
- # Move the sequence dimensions back to positions 1, 2. Move the head dimension
- # to position 3. This more closely matches the return shape of the attention
- # output: (b, n, h, d).
- attn_weights = rearrange(attention, "b g h n s -> b n s (h g)")
- if average_attn_weights:
- attn_weights = attn_weights.mean(dim=1)
-
- return out, attn_weights
class MultiheadAttention(nn.Module):
def __init__(
@@ -327,4 +207,4 @@ class ResidualCrossAttentionBlock(nn.Module):
def forward(self, x: torch.Tensor, data: torch.Tensor):
x = x + self.attn(self.ln_1(x), self.ln_2(data))
x = x + self.mlp(self.ln_3(x))
- return x
\ No newline at end of file
+ return x
diff --git a/craftsman/models/transformers/perceiver_1d.py b/craftsman/models/transformers/perceiver_1d.py
old mode 100644
new mode 100755
diff --git a/craftsman/models/transformers/utils.py b/craftsman/models/transformers/utils.py
old mode 100644
new mode 100755
diff --git a/craftsman/pipeline.py b/craftsman/pipeline.py
old mode 100644
new mode 100755
index 15edafd62330df4c7d3d282ab09bcd228a6353b4..a0d62fe0990761f59dc4e7f1fe79ddea4bf7c04d
--- a/craftsman/pipeline.py
+++ b/craftsman/pipeline.py
@@ -158,6 +158,7 @@ class CraftsManPipeline():
background_color: List[int] = [255, 255, 255],
foreground_ratio: float = 0.95,
mc_depth: int = 8,
+ only_max_component: bool = False,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -198,6 +199,9 @@ class CraftsManPipeline():
mc_depth (`int`, *optional*, defaults to 8):
The resolution of the Marching Cubes algorithm. The resolution is the number of cubes in the x, y, and z.
8 means 2^8 = 256 cubes in each dimension. The higher the resolution, the more detailed the mesh will be.
+ only_max_component (`bool`, *optional*, defaults to `False`):
+ Whether to only keep the largest connected component of the mesh. This is useful when the mesh has
+ multiple components and only the largest one is needed.
Examples:
Returns:
@@ -258,6 +262,15 @@ class CraftsManPipeline():
if output_type == "trimesh":
import trimesh
cur_mesh = trimesh.Trimesh(vertices=mesh_v_f[0][0], faces=mesh_v_f[0][1])
+ if only_max_component:
+ components = cur_mesh.split(only_watertight=False)
+ bbox = []
+ for c in components:
+ bbmin = c.vertices.min(0)
+ bbmax = c.vertices.max(0)
+ bbox.append((bbmax - bbmin).max())
+ max_component = np.argmax(bbox)
+ cur_mesh = components[max_component]
mesh.append(cur_mesh)
elif output_type == "np":
mesh.append(mesh_v_f[0])
diff --git a/craftsman/systems/__pycache__/__init__.cpython-310.pyc b/craftsman/systems/__pycache__/__init__.cpython-310.pyc
index a33a8274329244348289bbfd7de12c4a6533a0d1..275a97fa604c9525eb8a9aeb5cc7f70394a6f088 100644
Binary files a/craftsman/systems/__pycache__/__init__.cpython-310.pyc and b/craftsman/systems/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/base.cpython-310.pyc b/craftsman/systems/__pycache__/base.cpython-310.pyc
index 368251a7d4d3b344031271b791f22c803dbea049..22227831b79934f817e5104d93855117bd837fe5 100644
Binary files a/craftsman/systems/__pycache__/base.cpython-310.pyc and b/craftsman/systems/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc
index 3dc2e3115a0738aa6d00e90c50b83cae66274e60..355b9b8ad90cef4e78e8a19d0489ed73aced96e0 100644
Binary files a/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc and b/craftsman/systems/__pycache__/pixart_diffusion.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc
index 12bf192a5b35186b6aa033695f0ea61c3272a554..71d14323b7666b642eb2b2a867484678f42eb5d4 100644
Binary files a/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc and b/craftsman/systems/__pycache__/shape_autoencoder.cpython-310.pyc differ
diff --git a/craftsman/systems/__pycache__/utils.cpython-310.pyc b/craftsman/systems/__pycache__/utils.cpython-310.pyc
old mode 100644
new mode 100755
diff --git a/craftsman/systems/pixart_diffusion.py b/craftsman/systems/pixart_diffusion.py
old mode 100644
new mode 100755
index 529167af2542f7d857defa39074e8047817b0b8c..93466315855a0b1b93d44e8977eeeda103f95fe5
--- a/craftsman/systems/pixart_diffusion.py
+++ b/craftsman/systems/pixart_diffusion.py
@@ -251,9 +251,9 @@ class PixArtDiffusionSystem(BaseSystem):
return {
"loss_diffusion": loss,
"latents": latents,
- "x_t": x_t,
+ "x_t": noisy_z,
"noise": noise,
- "noise_pred": pred_noise,
+ "noise_pred": noise_pred,
"timesteps": timesteps,
}
@@ -373,4 +373,4 @@ class PixArtDiffusionSystem(BaseSystem):
return outputs
def on_validation_epoch_end(self):
- pass
\ No newline at end of file
+ pass
diff --git a/craftsman/utils/__pycache__/__init__.cpython-310.pyc b/craftsman/utils/__pycache__/__init__.cpython-310.pyc
index b9c8c4a21e1ddb79aab68a042d8c41e41bae9ad4..dc8fc56aa251450661270183a09d2d2ef7ce4e3c 100644
Binary files a/craftsman/utils/__pycache__/__init__.cpython-310.pyc and b/craftsman/utils/__pycache__/__init__.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/__init__.cpython-311.pyc b/craftsman/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d176e286f072ec0c415f0db0f8c81b0967893b89
Binary files /dev/null and b/craftsman/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/base.cpython-310.pyc b/craftsman/utils/__pycache__/base.cpython-310.pyc
index 83a68795573ff8940846efa2c7d038d175d73b83..853925fb34f1b0e3cf043bcaaae367591f543ccd 100644
Binary files a/craftsman/utils/__pycache__/base.cpython-310.pyc and b/craftsman/utils/__pycache__/base.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/base.cpython-311.pyc b/craftsman/utils/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83f500bde0064775718572f0abc99760debdb29f
Binary files /dev/null and b/craftsman/utils/__pycache__/base.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc
index bd5b481c93c34571f33c0fc48ef145ed23b1fa58..fed9e95b7e88f3a21a4f3816a7f2ae36b5a1b295 100644
Binary files a/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc and b/craftsman/utils/__pycache__/checkpoint.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/config.cpython-310.pyc b/craftsman/utils/__pycache__/config.cpython-310.pyc
index aac3aef5468c180268a0c8c52a1707e60d870822..45270df949585d3d479f430499777ffa86b42f68 100644
Binary files a/craftsman/utils/__pycache__/config.cpython-310.pyc and b/craftsman/utils/__pycache__/config.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/config.cpython-311.pyc b/craftsman/utils/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f70d9dc896ed0327846e4140fc2261f359ad954c
Binary files /dev/null and b/craftsman/utils/__pycache__/config.cpython-311.pyc differ
diff --git a/craftsman/utils/__pycache__/misc.cpython-310.pyc b/craftsman/utils/__pycache__/misc.cpython-310.pyc
index 3adf88e303ffbe6e877b42cbadc7f326fcbad751..b6c11bf8560b424fcd6f45d5da9202ddcde69eae 100644
Binary files a/craftsman/utils/__pycache__/misc.cpython-310.pyc and b/craftsman/utils/__pycache__/misc.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/ops.cpython-310.pyc b/craftsman/utils/__pycache__/ops.cpython-310.pyc
index debfa41eae014018d9d0ba540248b2f77de3c23b..e8bdd120d0dc5c632d918740c6eeab3e0fb99f68 100644
Binary files a/craftsman/utils/__pycache__/ops.cpython-310.pyc and b/craftsman/utils/__pycache__/ops.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/saving.cpython-310.pyc b/craftsman/utils/__pycache__/saving.cpython-310.pyc
index 4b38d835c68dc5a67dad09beea97d2bb2f47fcf4..a8fa8a52d8589ce573de44a75b1ebe40fdf99c9f 100644
Binary files a/craftsman/utils/__pycache__/saving.cpython-310.pyc and b/craftsman/utils/__pycache__/saving.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc
index d7f9b4184fd39cf6b2df5076a97681f89ea577fe..7f65835c4ecd6f9233a169a24e5e14184a122ca4 100644
Binary files a/craftsman/utils/__pycache__/scheduler.cpython-310.pyc and b/craftsman/utils/__pycache__/scheduler.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/typing.cpython-310.pyc b/craftsman/utils/__pycache__/typing.cpython-310.pyc
index 24970b9142feaba6029154274a042c4c34fc6e92..79b8dae01c4afaf63968f2bb64582ba8204f29a1 100644
Binary files a/craftsman/utils/__pycache__/typing.cpython-310.pyc and b/craftsman/utils/__pycache__/typing.cpython-310.pyc differ
diff --git a/craftsman/utils/__pycache__/typing.cpython-311.pyc b/craftsman/utils/__pycache__/typing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..448acbdfe51e93efe4f6efbc5fa514db87a3902c
Binary files /dev/null and b/craftsman/utils/__pycache__/typing.cpython-311.pyc differ
diff --git a/docker/requirements.txt b/docker/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..297f4a1b96ee50046dba8c079bdf3165c1f66728
--- /dev/null
+++ b/docker/requirements.txt
@@ -0,0 +1,34 @@
+datasets==2.19.0
+diffusers==0.31.0
+einops==0.8.0
+huggingface-hub==0.26.2
+imageio==2.34.1
+jaxtyping==0.2.28
+joblib==1.4.0
+lightning-utilities==0.11.2
+matplotlib==3.8.4
+numpy==1.26.4
+omegaconf==2.3.0
+opencv-python==4.9.0.80
+pandas==2.2.2
+pillow==10.3.0
+plyfile==1.0.3
+PyMCubes==0.1.4
+pyparsing==3.1.2
+pytorch-lightning==2.2.4
+PyYAML==6.0.1
+safetensors==0.4.3
+scikit-image==0.23.2
+scipy==1.13.0
+tensorboard==2.16.2
+tensorboardX==2.6.2.2
+timm==0.9.16
+tokenizers==0.19.1
+tqdm==4.66.2
+transformers==4.40.1
+trimesh==4.3.2
+spaces==0.28.3
+accelerate==0.29.1
+rembg==2.0.59
+gradio==5.5.0
+wandb==0.18.6
\ No newline at end of file
diff --git a/server.py b/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..614a21e19baac9991c6a16871cb0cb7f340797eb
--- /dev/null
+++ b/server.py
@@ -0,0 +1,98 @@
+import argparse
+import base64
+import os
+from datetime import datetime
+import traceback
+import torch
+import trimesh
+from craftsman import CraftsManPipeline
+
+CURRENT_DIR = f'/tmp/native3d_server/{os.getpid()}'
+os.makedirs(CURRENT_DIR, exist_ok=True)
+
+def parse_parameters():
+ parser = argparse.ArgumentParser("native3d")
+ parser.add_argument('--host', default="0.0.0.0", type=str)
+ parser.add_argument('--port', default=80, type=int)
+ return parser.parse_args()
+
+# -------------------- fastapi --------------------
+from typing import Optional
+from pydantic import BaseModel, Field
+
+class Native3DRequestV1(BaseModel):
+ image_path: str # input image path
+ mesh_path: str # output mesh path, support glb or obj in clean dir
+
+class Native3DResponseV1(BaseModel):
+ pass
+
+class Native3DRequestV2(BaseModel):
+ image_bytes: str # input image bytes(base64)
+ mesh_type: str # output mesh type, support glb or obj
+
+class Native3DResponseV2(BaseModel):
+ mesh_bytes: str # output mesh bytes(base64)
+
+if __name__=="__main__":
+ parse_args = parse_parameters()
+
+ # prepare models
+ pipeline = CraftsManPipeline.from_pretrained("ckpts/craftsman-v1-5", device="cuda:0", torch_dtype=torch.float32)
+
+ # -------------------- fastapi --------------------
+ from fastapi import FastAPI, Request
+ import requests
+ app = FastAPI()
+
+ @app.post("/native3d_v1", response_model=Native3DResponseV1)
+ async def native3d(request: Request, image_to_mesh_request: Native3DRequestV1):
+ try:
+ print(f"image_to_mesh_request = {image_to_mesh_request}")
+ mesh = pipeline(image_to_mesh_request.image_path).meshes[0]
+ os.makedirs(os.path.dirname(os.path.abspath(image_to_mesh_request.mesh_path)), exist_ok=True)
+ mesh.export(image_to_mesh_request.mesh_path)
+ except Exception as e:
+ traceback.print_exc()
+ print(f"generate_model error: {e}")
+ return Native3DResponseV1()
+
+ @app.post("/native3d_v2", response_model=Native3DResponseV2)
+ async def native3d(request: Request, image_to_mesh_request: Native3DRequestV2):
+ try:
+ # print(f"image_to_mesh_request = {image_to_mesh_request}")
+ mesh_type = image_to_mesh_request.mesh_type
+ assert mesh_type in ['obj', 'glb']
+ task_id = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') + '-' + 'native3d'
+ current_dir = os.path.join(CURRENT_DIR, task_id)
+ os.makedirs(current_dir, exist_ok=True)
+ image_path = os.path.join(current_dir, 'input_image.png')
+ with open(image_path, 'wb') as f:
+ f.write(base64.b64decode(image_to_mesh_request.image_bytes))
+ mesh_path = os.path.join(current_dir, f'output_mesh.{mesh_type}')
+ import time
+ start = time.time()
+ # mesh = pipeline(image_path, mc_depth=8, num_inference_steps=25).meshes[0]
+ # mesh = pipeline(image_path, mc_depth=7, num_inference_steps=25).meshes[0]
+ mesh = pipeline(image_path, mc_depth=7, num_inference_steps=50).meshes[0]
+ print(f"Time: {time.time() - start}s")
+ os.makedirs(os.path.dirname(os.path.abspath(mesh_path)), exist_ok=True)
+ mesh.visual = trimesh.visual.TextureVisuals(
+ material=trimesh.visual.material.PBRMaterial(
+ baseColorFactor=(255, 255, 255), main_color=(255, 255, 255), metallicFactor=0.05, roughnessFactor=1.0
+ )
+ )
+ mesh.export(mesh_path)
+ with open(mesh_path, 'rb') as f:
+ mesh_bytes = f.read()
+ except Exception as e:
+ traceback.print_exc()
+ print(f"generate_model error: {e}")
+ return Native3DResponseV2(mesh_bytes=base64.b64encode(mesh_bytes).decode('utf-8'))
+
+ @app.get("/health")
+ async def health():
+ return {"status": "OK"}
+
+ import uvicorn
+ uvicorn.run(app, host=parse_args.host, port=parse_args.port)