abetlen committed on
Commit
de95f39
1 Parent(s): 7a99dd5

Upload convert-hf-vision-to-gguf.py

Browse files
Files changed (1) hide show
  1. convert-hf-vision-to-gguf.py +182 -0
convert-hf-vision-to-gguf.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import numpy as np
4
+ from gguf import *
5
+ from safetensors import safe_open
6
+ from transformers import AutoTokenizer
7
+
8
+
9
+ def k(raw_key: str, arch: str) -> str:
10
+ return raw_key.format(arch=arch)
11
+
12
+
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument("--model", type=str, default="nanoLLaVA/model.safetensors")
15
+ parser.add_argument("--tokenizer", type=str, default="nanoLLaVA")
16
+ args = parser.parse_args()
17
+
18
+ tensors = safe_open(args.model, framework="np", device="cpu")
19
+
20
+ ### Vision encoder
21
+
22
+ ftype = 1 # fp16
23
+
24
+ fname_middle = "mmproj-"
25
+ has_text_encoder = False
26
+ has_llava_projector = True
27
+
28
+ fname_out = "nanollava-mmproj-f16.gguf"
29
+ fout = GGUFWriter(fname_out, arch="clip")
30
+
31
+ fout.add_bool("clip.has_text_encoder", False)
32
+ fout.add_bool("clip.has_vision_encoder", True)
33
+ fout.add_bool("clip.has_llava_projector", True)
34
+ fout.add_file_type(ftype) # fp16
35
+
36
+ model_name = "qnguyen3/nanoLLaVA"
37
+ fout.add_name(model_name)
38
+ fout.add_description("image encoder for " + model_name)
39
+ fout.add_string("clip.projector_type", "mlp")
40
+
41
+ # vision model hparams
42
+ VISION = "clip.vision"
43
+ fout.add_uint32("clip.vision.image_size", 378)
44
+ fout.add_uint32("clip.vision.patch_size", 14)
45
+ fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), 1152)
46
+ fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
47
+ fout.add_uint32("clip.vision.projection_dim", 2048)
48
+ fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
49
+ fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
50
+ fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), 27 + 1)
51
+
52
+ fout.add_array("clip.vision.image_mean", [0.5, 0.5, 0.5])
53
+ fout.add_array("clip.vision.image_std", [0.5, 0.5, 0.5])
54
+ fout.add_bool("clip.use_gelu", True) # using regular GELU instead of quick
55
+
56
+ # vision projection
57
+ fout.add_tensor(
58
+ "mm.0.weight",
59
+ tensors.get_tensor("model.mm_projector.0.weight").astype(
60
+ np.float16
61
+ ),
62
+ )
63
+ fout.add_tensor(
64
+ "mm.0.bias",
65
+ tensors.get_tensor("model.mm_projector.0.bias").astype(np.float32),
66
+ )
67
+ fout.add_tensor(
68
+ "mm.2.weight",
69
+ tensors.get_tensor("model.mm_projector.2.weight").astype(
70
+ np.float16
71
+ ),
72
+ )
73
+ fout.add_tensor(
74
+ "mm.2.bias",
75
+ tensors.get_tensor("model.mm_projector.2.bias").astype(np.float32),
76
+ )
77
+
78
+ # encoder (siglip)
79
+ fout.add_tensor(
80
+ "v.position_embd.weight",
81
+ tensors.get_tensor("model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight").astype(
82
+ np.float16
83
+ ),
84
+ )
85
+ fout.add_tensor(
86
+ "v.patch_embd.weight",
87
+ tensors.get_tensor(
88
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight"
89
+ )
90
+ .reshape(1152, 3, 14, 14)
91
+ .astype(np.float16),
92
+ )
93
+ fout.add_tensor(
94
+ "v.patch_embd.bias",
95
+ tensors.get_tensor(
96
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias"
97
+ ).astype(np.float32),
98
+ )
99
+
100
+ fout.add_tensor(
101
+ "v.post_ln.weight",
102
+ tensors.get_tensor("model.vision_tower.vision_tower.vision_model.post_layernorm.weight").astype(
103
+ np.float32
104
+ ),
105
+ )
106
+ fout.add_tensor(
107
+ "v.post_ln.bias",
108
+ tensors.get_tensor("model.vision_tower.vision_tower.vision_model.post_layernorm.bias").astype(
109
+ np.float32
110
+ ),
111
+ )
112
+
113
+ def blk_tensor(i, name):
114
+ return tensors.get_tensor(
115
+ rf"model.vision_tower.vision_tower.vision_model.encoder.layers.{i}.{name}"
116
+ )
117
+
118
+ def add_tensor(blk_id, gguf_id=None):
119
+ if gguf_id is None:
120
+ gguf_id = blk_id
121
+
122
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_q.weight", blk_tensor(blk_id, "self_attn.q_proj.weight").astype(np.float16))
123
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_q.bias", blk_tensor(blk_id, "self_attn.q_proj.bias").astype(np.float32))
124
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_k.weight", blk_tensor(blk_id, "self_attn.k_proj.weight").astype(np.float16))
125
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_k.bias", blk_tensor(blk_id, "self_attn.k_proj.bias").astype(np.float32))
126
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_v.weight", blk_tensor(blk_id, "self_attn.v_proj.weight").astype(np.float16))
127
+ fout.add_tensor(f"v.blk.{gguf_id}.attn_v.bias", blk_tensor(blk_id, "self_attn.v_proj.bias").astype(np.float32))
128
+
129
+ fout.add_tensor(
130
+ f"v.blk.{gguf_id}.attn_out.weight",
131
+ blk_tensor(blk_id, "self_attn.out_proj.weight").astype(np.float16),
132
+ )
133
+ fout.add_tensor(
134
+ f"v.blk.{gguf_id}.attn_out.bias",
135
+ blk_tensor(blk_id, "self_attn.out_proj.bias").astype(np.float32),
136
+ )
137
+
138
+ fout.add_tensor(
139
+ f"v.blk.{gguf_id}.ln1.weight",
140
+ blk_tensor(blk_id, "layer_norm1.weight").astype(np.float32),
141
+ )
142
+ fout.add_tensor(
143
+ f"v.blk.{gguf_id}.ln1.bias",
144
+ blk_tensor(blk_id, "layer_norm1.bias").astype(np.float32),
145
+ )
146
+
147
+ fout.add_tensor(
148
+ f"v.blk.{gguf_id}.ffn_down.weight",
149
+ blk_tensor(blk_id, "mlp.fc1.weight").astype(np.float16),
150
+ )
151
+ fout.add_tensor(
152
+ f"v.blk.{gguf_id}.ffn_down.bias",
153
+ blk_tensor(blk_id, "mlp.fc1.bias").astype(np.float32),
154
+ )
155
+ fout.add_tensor(
156
+ f"v.blk.{gguf_id}.ffn_up.weight",
157
+ blk_tensor(blk_id, "mlp.fc2.weight").astype(np.float16),
158
+ )
159
+ fout.add_tensor(
160
+ f"v.blk.{gguf_id}.ffn_up.bias",
161
+ blk_tensor(blk_id, "mlp.fc2.bias").astype(np.float32),
162
+ )
163
+
164
+ fout.add_tensor(
165
+ f"v.blk.{gguf_id}.ln2.weight",
166
+ blk_tensor(blk_id, "layer_norm2.weight").astype(np.float32),
167
+ )
168
+ fout.add_tensor(
169
+ f"v.blk.{gguf_id}.ln2.bias",
170
+ blk_tensor(blk_id, "layer_norm2.bias").astype(np.float32),
171
+ )
172
+
173
+ for i in range(27):
174
+ add_tensor(i)
175
+
176
+ # Duplicate the last block (llava-cli skips over this)
177
+ add_tensor(26, 27)
178
+
179
+ fout.write_header_to_file()
180
+ fout.write_kv_data_to_file()
181
+ fout.write_tensors_to_file()
182
+ fout.close()