isaiahbjork commited on
Commit
4ed6f02
·
verified ·
1 Parent(s): 61d892b

Upload folder using huggingface_hub

Browse files
added_tokens copy.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
config.json CHANGED
@@ -1,352 +1,48 @@
1
  {
2
- "_attn_implementation_autoset": true,
3
- "add_cross_attention": false,
4
- "architectures": [
5
- "Qwen2VLForConditionalGeneration"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  ],
7
- "attention_dropout": 0.0,
8
- "bad_words_ids": null,
9
- "begin_suppress_tokens": null,
10
- "bos_token_id": 151643,
11
- "chunk_size_feed_forward": 0,
12
- "cross_attention_hidden_size": null,
13
- "decoder_start_token_id": null,
14
- "diversity_penalty": 0.0,
15
- "do_sample": false,
16
- "early_stopping": false,
17
- "encoder_no_repeat_ngram_size": 0,
18
- "eos_token_id": 151645,
19
- "exponential_decay_length_penalty": null,
20
- "finetuning_task": null,
21
- "forced_bos_token_id": null,
22
- "forced_eos_token_id": null,
23
- "hidden_act": "silu",
24
  "hidden_size": 1536,
25
- "id2label": {
26
- "0": "LABEL_0",
27
- "1": "LABEL_1"
28
- },
29
- "image_token_id": 151655,
30
- "initializer_range": 0.02,
31
- "intermediate_size": 8960,
32
- "is_decoder": false,
33
- "is_encoder_decoder": false,
34
- "label2id": {
35
- "LABEL_0": 0,
36
- "LABEL_1": 1
37
- },
38
- "length_penalty": 1.0,
39
- "max_length": 20,
40
- "max_position_embeddings": 32768,
41
- "max_window_layers": 28,
42
- "min_length": 0,
43
  "model_type": "qwen2_vl",
44
- "no_repeat_ngram_size": 0,
45
- "num_attention_heads": 12,
46
- "num_beam_groups": 1,
47
- "num_beams": 1,
48
- "num_hidden_layers": 28,
49
- "num_key_value_heads": 2,
50
- "num_return_sequences": 1,
51
- "output_attentions": false,
52
- "output_hidden_states": false,
53
- "output_scores": false,
54
- "pad_token_id": null,
55
- "prefix": null,
56
- "problem_type": null,
57
- "pruned_heads": {},
58
- "quantization": {
59
- "group_size": 64,
60
- "bits": 4
61
- },
62
- "remove_invalid_values": false,
63
- "repetition_penalty": 1.0,
64
- "return_dict": true,
65
- "return_dict_in_generate": false,
66
- "rms_norm_eps": 1e-06,
67
- "rope_scaling": {
68
- "mrope_section": [
69
- 16,
70
- 24,
71
- 24
72
- ],
73
- "type": "default",
74
- "rope_type": "default"
75
- },
76
- "rope_theta": 1000000.0,
77
- "rope_traditional": true,
78
- "sep_token_id": null,
79
- "sliding_window": 32768,
80
- "suppress_tokens": null,
81
- "task_specific_params": null,
82
- "temperature": 1.0,
83
- "text_config": {
84
- "vision_config": {
85
- "return_dict": true,
86
- "output_hidden_states": false,
87
- "output_attentions": false,
88
- "torchscript": false,
89
- "torch_dtype": null,
90
- "use_bfloat16": false,
91
- "tf_legacy_loss": false,
92
- "pruned_heads": {},
93
- "tie_word_embeddings": true,
94
- "chunk_size_feed_forward": 0,
95
- "is_encoder_decoder": false,
96
- "is_decoder": false,
97
- "cross_attention_hidden_size": null,
98
- "add_cross_attention": false,
99
- "tie_encoder_decoder": false,
100
- "max_length": 20,
101
- "min_length": 0,
102
- "do_sample": false,
103
- "early_stopping": false,
104
- "num_beams": 1,
105
- "num_beam_groups": 1,
106
- "diversity_penalty": 0.0,
107
- "temperature": 1.0,
108
- "top_k": 50,
109
- "top_p": 1.0,
110
- "typical_p": 1.0,
111
- "repetition_penalty": 1.0,
112
- "length_penalty": 1.0,
113
- "no_repeat_ngram_size": 0,
114
- "encoder_no_repeat_ngram_size": 0,
115
- "bad_words_ids": null,
116
- "num_return_sequences": 1,
117
- "output_scores": false,
118
- "return_dict_in_generate": false,
119
- "forced_bos_token_id": null,
120
- "forced_eos_token_id": null,
121
- "remove_invalid_values": false,
122
- "exponential_decay_length_penalty": null,
123
- "suppress_tokens": null,
124
- "begin_suppress_tokens": null,
125
- "architectures": null,
126
- "finetuning_task": null,
127
- "id2label": {
128
- "0": "LABEL_0",
129
- "1": "LABEL_1"
130
- },
131
- "label2id": {
132
- "LABEL_0": 0,
133
- "LABEL_1": 1
134
- },
135
- "tokenizer_class": null,
136
- "prefix": null,
137
- "bos_token_id": null,
138
- "pad_token_id": null,
139
- "eos_token_id": null,
140
- "sep_token_id": null,
141
- "decoder_start_token_id": null,
142
- "task_specific_params": null,
143
- "problem_type": null,
144
- "_name_or_path": "",
145
- "_attn_implementation_autoset": false,
146
- "in_chans": 3,
147
- "model_type": "qwen2_vl",
148
- "spatial_patch_size": 14,
149
- "depth": 32,
150
- "embed_dim": 1280,
151
- "hidden_size": 1536,
152
- "hidden_act": "quick_gelu",
153
- "mlp_ratio": 4,
154
- "num_heads": 16,
155
- "in_channels": 3,
156
- "patch_size": 14,
157
- "spatial_merge_size": 2,
158
- "temporal_patch_size": 2
159
- },
160
- "vocab_size": 151936,
161
- "max_position_embeddings": 32768,
162
- "hidden_size": 1536,
163
- "intermediate_size": 8960,
164
- "num_hidden_layers": 28,
165
- "num_attention_heads": 12,
166
- "use_sliding_window": false,
167
- "sliding_window": 32768,
168
- "max_window_layers": 28,
169
- "num_key_value_heads": 2,
170
- "hidden_act": "silu",
171
- "initializer_range": 0.02,
172
- "rms_norm_eps": 1e-06,
173
- "use_cache": false,
174
- "rope_theta": 1000000.0,
175
- "attention_dropout": 0.0,
176
- "rope_scaling": {
177
- "mrope_section": [
178
- 16,
179
- 24,
180
- 24
181
- ],
182
- "type": "default",
183
- "rope_type": "default"
184
- },
185
- "return_dict": true,
186
- "output_hidden_states": false,
187
- "output_attentions": false,
188
- "torchscript": false,
189
- "torch_dtype": "bfloat16",
190
- "use_bfloat16": false,
191
- "tf_legacy_loss": false,
192
- "pruned_heads": {},
193
- "tie_word_embeddings": true,
194
- "chunk_size_feed_forward": 0,
195
- "is_encoder_decoder": false,
196
- "is_decoder": false,
197
- "cross_attention_hidden_size": null,
198
- "add_cross_attention": false,
199
- "tie_encoder_decoder": false,
200
- "max_length": 20,
201
- "min_length": 0,
202
- "do_sample": false,
203
- "early_stopping": false,
204
- "num_beams": 1,
205
- "num_beam_groups": 1,
206
- "diversity_penalty": 0.0,
207
- "temperature": 1.0,
208
- "top_k": 50,
209
- "top_p": 1.0,
210
- "typical_p": 1.0,
211
- "repetition_penalty": 1.0,
212
- "length_penalty": 1.0,
213
- "no_repeat_ngram_size": 0,
214
- "encoder_no_repeat_ngram_size": 0,
215
- "bad_words_ids": null,
216
- "num_return_sequences": 1,
217
- "output_scores": false,
218
- "return_dict_in_generate": false,
219
- "forced_bos_token_id": null,
220
- "forced_eos_token_id": null,
221
- "remove_invalid_values": false,
222
- "exponential_decay_length_penalty": null,
223
- "suppress_tokens": null,
224
- "begin_suppress_tokens": null,
225
- "architectures": [
226
- "Qwen2VLForConditionalGeneration"
227
- ],
228
- "finetuning_task": null,
229
- "id2label": {
230
- "0": "LABEL_0",
231
- "1": "LABEL_1"
232
- },
233
- "label2id": {
234
- "LABEL_0": 0,
235
- "LABEL_1": 1
236
- },
237
- "tokenizer_class": null,
238
- "prefix": null,
239
- "bos_token_id": 151643,
240
- "pad_token_id": null,
241
- "eos_token_id": 151645,
242
- "sep_token_id": null,
243
- "decoder_start_token_id": null,
244
- "task_specific_params": null,
245
- "problem_type": null,
246
- "_name_or_path": "ShiniShiho/ShowUI-2B",
247
- "_attn_implementation_autoset": true,
248
- "image_token_id": 151655,
249
- "model_type": "qwen2_vl",
250
- "tokenizer_model_max_length": 4096,
251
- "video_token_id": 151656,
252
- "vision_end_token_id": 151653,
253
- "vision_start_token_id": 151652,
254
- "vision_token_id": 151654
255
- },
256
- "tf_legacy_loss": false,
257
- "tie_encoder_decoder": false,
258
- "tie_word_embeddings": true,
259
- "tokenizer_class": null,
260
- "tokenizer_model_max_length": 4096,
261
- "top_k": 50,
262
- "top_p": 1.0,
263
- "torch_dtype": "bfloat16",
264
- "torchscript": false,
265
- "transformers_version": "4.48.3",
266
- "typical_p": 1.0,
267
- "use_bfloat16": false,
268
- "use_cache": false,
269
- "use_sliding_window": false,
270
- "video_token_id": 151656,
271
- "vision_config": {
272
- "return_dict": true,
273
- "output_hidden_states": false,
274
- "output_attentions": false,
275
- "torchscript": false,
276
- "torch_dtype": null,
277
- "use_bfloat16": false,
278
- "tf_legacy_loss": false,
279
- "pruned_heads": {},
280
- "tie_word_embeddings": true,
281
- "chunk_size_feed_forward": 0,
282
- "is_encoder_decoder": false,
283
- "is_decoder": false,
284
- "cross_attention_hidden_size": null,
285
- "add_cross_attention": false,
286
- "tie_encoder_decoder": false,
287
- "max_length": 20,
288
- "min_length": 0,
289
- "do_sample": false,
290
- "early_stopping": false,
291
- "num_beams": 1,
292
- "num_beam_groups": 1,
293
- "diversity_penalty": 0.0,
294
- "temperature": 1.0,
295
- "top_k": 50,
296
- "top_p": 1.0,
297
- "typical_p": 1.0,
298
- "repetition_penalty": 1.0,
299
- "length_penalty": 1.0,
300
- "no_repeat_ngram_size": 0,
301
- "encoder_no_repeat_ngram_size": 0,
302
- "bad_words_ids": null,
303
- "num_return_sequences": 1,
304
- "output_scores": false,
305
- "return_dict_in_generate": false,
306
- "forced_bos_token_id": null,
307
- "forced_eos_token_id": null,
308
- "remove_invalid_values": false,
309
- "exponential_decay_length_penalty": null,
310
- "suppress_tokens": null,
311
- "begin_suppress_tokens": null,
312
- "architectures": null,
313
- "finetuning_task": null,
314
- "id2label": {
315
- "0": "LABEL_0",
316
- "1": "LABEL_1"
317
- },
318
- "label2id": {
319
- "LABEL_0": 0,
320
- "LABEL_1": 1
321
- },
322
- "tokenizer_class": null,
323
- "prefix": null,
324
- "bos_token_id": null,
325
- "pad_token_id": null,
326
- "eos_token_id": null,
327
- "sep_token_id": null,
328
- "decoder_start_token_id": null,
329
- "task_specific_params": null,
330
- "problem_type": null,
331
- "_name_or_path": "",
332
- "_attn_implementation_autoset": false,
333
- "transformers_version": "4.48.3",
334
- "in_chans": 3,
335
- "model_type": "qwen2_vl",
336
- "spatial_patch_size": 14,
337
- "depth": 32,
338
- "embed_dim": 1280,
339
- "hidden_size": 1536,
340
- "hidden_act": "quick_gelu",
341
- "mlp_ratio": 4,
342
- "num_heads": 16,
343
- "in_channels": 3,
344
- "patch_size": 14,
345
- "spatial_merge_size": 2,
346
- "temporal_patch_size": 2
347
- },
348
- "vision_end_token_id": 151653,
349
- "vision_start_token_id": 151652,
350
- "vision_token_id": 151654,
351
- "vocab_size": 151936
352
- }
 
1
  {
2
+ "_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
3
+ "architectures": [
4
+ "Qwen2VLForConditionalGeneration"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "image_token_id": 151655,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8960,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen2_vl",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 2,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "mrope_section": [
23
+ 16,
24
+ 24,
25
+ 24
26
  ],
27
+ "type": "mrope"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "tie_word_embeddings": true,
32
+ "tokenizer_model_max_length": 4096,
33
+ "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.45.0.dev0",
35
+ "use_cache": false,
36
+ "use_sliding_window": false,
37
+ "video_token_id": 151656,
38
+ "vision_config": {
 
 
 
 
 
39
  "hidden_size": 1536,
40
+ "in_chans": 3,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "model_type": "qwen2_vl",
42
+ "spatial_patch_size": 14
43
+ },
44
+ "vision_end_token_id": 151653,
45
+ "vision_start_token_id": 151652,
46
+ "vision_token_id": 151654,
47
+ "vocab_size": 151936
48
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0715fbe4bee2611c2625e07c32e14c228bf81b416ee04c92df1ee2c0e4946253
3
- size 1388178008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ac71ee7fda50033f79a375f26bd39f6e12f4595df830c33182658c78a5ed6f
3
+ size 1543855784