Hennara committed on
Commit
6c756ca
·
verified ·
1 Parent(s): 846a0cd

Upload KawnIdefics3ForConditionalGeneration

Browse files
config.json CHANGED
@@ -1,72 +1,75 @@
1
  {
2
- "_name_or_path": "kawn_idefics3_siglib14_384",
3
  "architectures": [
4
  "KawnIdefics3ForConditionalGeneration"
5
  ],
6
  "ignore_index": -100,
7
  "image_grid_pinpoints": [
8
  [
9
- 384,
10
- 384
11
  ],
12
  [
13
- 384,
14
- 768
15
  ],
16
  [
17
- 384,
18
- 1152
19
  ],
20
  [
21
- 384,
22
- 1536
23
  ],
24
  [
25
- 1536,
26
- 384
27
  ],
28
  [
29
- 1152,
30
- 384
31
  ],
32
  [
33
- 768,
34
- 384
35
  ],
36
  [
37
- 768,
38
- 768
39
  ],
40
  [
41
- 768,
42
- 1152
43
  ],
44
  [
45
- 768,
46
- 1536
47
  ],
48
  [
49
- 1536,
50
- 768
51
  ],
52
  [
53
- 1152,
54
- 768
55
  ],
56
  [
57
- 1152,
58
- 1536
59
  ],
60
  [
61
- 1536,
62
- 1152
63
  ],
64
  [
65
- 1152,
66
- 1152
 
 
 
 
67
  ]
68
  ],
69
- "image_seq_length": 182,
70
  "image_token_id": 256001,
71
  "model_type": "kawn_idefics3",
72
  "scale_factor": 2,
@@ -104,12 +107,14 @@
104
  "vision_config": {
105
  "_attn_implementation_autoset": true,
106
  "hidden_size": 1152,
107
- "image_size": 384,
 
108
  "intermediate_size": 4304,
109
  "model_type": "siglip_vision_model",
110
  "num_attention_heads": 16,
111
  "num_hidden_layers": 27,
112
- "patch_size": 14
 
113
  },
114
  "vision_feature_layer": null
115
  }
 
1
  {
 
2
  "architectures": [
3
  "KawnIdefics3ForConditionalGeneration"
4
  ],
5
  "ignore_index": -100,
6
  "image_grid_pinpoints": [
7
  [
8
+ 364,
9
+ 364
10
  ],
11
  [
12
+ 364,
13
+ 728
14
  ],
15
  [
16
+ 364,
17
+ 1092
18
  ],
19
  [
20
+ 1092,
21
+ 364
22
  ],
23
  [
24
+ 728,
25
+ 364
26
  ],
27
  [
28
+ 728,
29
+ 728
30
  ],
31
  [
32
+ 728,
33
+ 1092
34
  ],
35
  [
36
+ 1092,
37
+ 728
38
  ],
39
  [
40
+ 1092,
41
+ 1092
42
  ],
43
  [
44
+ 364,
45
+ 1456
46
  ],
47
  [
48
+ 1456,
49
+ 364
50
  ],
51
  [
52
+ 728,
53
+ 1456
54
  ],
55
  [
56
+ 1456,
57
+ 728
58
  ],
59
  [
60
+ 1456,
61
+ 1092
62
  ],
63
  [
64
+ 1092,
65
+ 1456
66
+ ],
67
+ [
68
+ 1456,
69
+ 1456
70
  ]
71
  ],
72
+ "image_seq_length": 169,
73
  "image_token_id": 256001,
74
  "model_type": "kawn_idefics3",
75
  "scale_factor": 2,
 
107
  "vision_config": {
108
  "_attn_implementation_autoset": true,
109
  "hidden_size": 1152,
110
+ "image_size": 364,
111
+ "initializer_range": 0.02,
112
  "intermediate_size": 4304,
113
  "model_type": "siglip_vision_model",
114
  "num_attention_heads": 16,
115
  "num_hidden_layers": 27,
116
+ "patch_size": 14,
117
+ "vision_use_head": false
118
  },
119
  "vision_feature_layer": null
120
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba2eda00fa86a92ba7db61f66370ba6fd7142d270864789638f6f2b38264c08e
3
- size 4995096080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541b49f43819467781d6c9ee78ea536ca9bf0f1faabad845e3946e652f1bc896
3
+ size 4992807984
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b41d4dc5bc76fb9f7a806ceb0127e85d41b30011246595938a5b0e20af5b704
3
- size 1111372352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02171ba9269c1c2ba76a3c0ff3845dbe5de817f04d2462e28772c9d7d4da6a04
3
+ size 1083060272
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 6106368640
4
  },
5
  "weight_map": {
6
  "connector.proj.weight": "model-00002-of-00002.safetensors",
@@ -133,10 +133,10 @@
133
  "language_model.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
134
  "language_model.model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
135
  "language_model.model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
136
- "language_model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
137
- "language_model.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
138
- "language_model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
139
- "language_model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
140
  "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
  "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
  "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
@@ -727,17 +727,6 @@
727
  "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
728
  "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
729
  "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
730
- "vision_tower.vision_model.head.attention.in_proj_bias": "model-00001-of-00002.safetensors",
731
- "vision_tower.vision_model.head.attention.in_proj_weight": "model-00001-of-00002.safetensors",
732
- "vision_tower.vision_model.head.attention.out_proj.bias": "model-00001-of-00002.safetensors",
733
- "vision_tower.vision_model.head.attention.out_proj.weight": "model-00001-of-00002.safetensors",
734
- "vision_tower.vision_model.head.layernorm.bias": "model-00001-of-00002.safetensors",
735
- "vision_tower.vision_model.head.layernorm.weight": "model-00001-of-00002.safetensors",
736
- "vision_tower.vision_model.head.mlp.fc1.bias": "model-00001-of-00002.safetensors",
737
- "vision_tower.vision_model.head.mlp.fc1.weight": "model-00001-of-00002.safetensors",
738
- "vision_tower.vision_model.head.mlp.fc2.bias": "model-00001-of-00002.safetensors",
739
- "vision_tower.vision_model.head.mlp.fc2.weight": "model-00001-of-00002.safetensors",
740
- "vision_tower.vision_model.head.probe": "model-00001-of-00002.safetensors",
741
  "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
742
  "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors"
743
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 6075769824
4
  },
5
  "weight_map": {
6
  "connector.proj.weight": "model-00002-of-00002.safetensors",
 
133
  "language_model.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
134
  "language_model.model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
135
  "language_model.model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
136
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
137
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
138
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
  "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
  "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
  "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
 
727
  "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
728
  "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
729
  "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
 
 
 
 
 
 
 
 
 
 
 
730
  "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
731
  "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors"
732
  }