MonolithFoundation committed · Commit 27ffecf · verified · 1 Parent(s): 3e2b004

Upload folder using huggingface_hub
config.json ADDED
@@ -0,0 +1,247 @@
1
+ {
2
+ "_name_or_path": "checkpoints/Flrq-2-large-combined/",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_florence2.Florence2Config",
9
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
10
+ },
11
+ "bos_token_id": 151643,
12
+ "eos_token_id": 151645,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 1536,
15
+ "ignore_index": -100,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 8960,
18
+ "max_position_embeddings": 32768,
19
+ "max_window_layers": 21,
20
+ "model_type": "florence2",
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 28,
23
+ "num_key_value_heads": 2,
24
+ "pad_token_id": 1,
25
+ "projection_dim": 1024,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_scaling": null,
28
+ "rope_theta": 1000000.0,
29
+ "sliding_window": null,
30
+ "text_config": {
31
+ "_attn_implementation_autoset": true,
32
+ "_name_or_path": "",
33
+ "add_cross_attention": false,
34
+ "architectures": [
35
+ "Qwen2ForCausalLM"
36
+ ],
37
+ "attention_dropout": 0.0,
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": 151643,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "decoder_start_token_id": null,
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": false,
46
+ "early_stopping": false,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 151645,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "hidden_act": "silu",
54
+ "hidden_size": 1536,
55
+ "id2label": {
56
+ "0": "LABEL_0",
57
+ "1": "LABEL_1"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 8960,
61
+ "is_decoder": false,
62
+ "is_encoder_decoder": false,
63
+ "label2id": {
64
+ "LABEL_0": 0,
65
+ "LABEL_1": 1
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 32768,
70
+ "max_window_layers": 21,
71
+ "min_length": 0,
72
+ "model_type": "qwen2",
73
+ "no_repeat_ngram_size": 0,
74
+ "num_attention_heads": 12,
75
+ "num_beam_groups": 1,
76
+ "num_beams": 1,
77
+ "num_hidden_layers": 28,
78
+ "num_key_value_heads": 2,
79
+ "num_return_sequences": 1,
80
+ "output_attentions": false,
81
+ "output_hidden_states": false,
82
+ "output_scores": false,
83
+ "pad_token_id": null,
84
+ "prefix": null,
85
+ "problem_type": null,
86
+ "pruned_heads": {},
87
+ "remove_invalid_values": false,
88
+ "repetition_penalty": 1.0,
89
+ "return_dict": true,
90
+ "return_dict_in_generate": false,
91
+ "rms_norm_eps": 1e-06,
92
+ "rope_scaling": null,
93
+ "rope_theta": 1000000.0,
94
+ "sep_token_id": null,
95
+ "sliding_window": null,
96
+ "suppress_tokens": null,
97
+ "task_specific_params": null,
98
+ "temperature": 1.0,
99
+ "tf_legacy_loss": false,
100
+ "tie_encoder_decoder": false,
101
+ "tie_word_embeddings": true,
102
+ "tokenizer_class": null,
103
+ "top_k": 50,
104
+ "top_p": 1.0,
105
+ "torch_dtype": "bfloat16",
106
+ "torchscript": false,
107
+ "typical_p": 1.0,
108
+ "use_bfloat16": false,
109
+ "use_cache": true,
110
+ "use_sliding_window": false,
111
+ "vocab_size": 151936
112
+ },
113
+ "tie_word_embeddings": true,
114
+ "torch_dtype": "bfloat16",
115
+ "transformers_version": "4.46.1",
116
+ "use_cache": false,
117
+ "use_sliding_window": false,
118
+ "vision_config": {
119
+ "_attn_implementation_autoset": false,
120
+ "_name_or_path": "",
121
+ "add_cross_attention": false,
122
+ "architectures": null,
123
+ "bad_words_ids": null,
124
+ "begin_suppress_tokens": null,
125
+ "bos_token_id": null,
126
+ "chunk_size_feed_forward": 0,
127
+ "cross_attention_hidden_size": null,
128
+ "decoder_start_token_id": null,
129
+ "depths": [
130
+ 1,
131
+ 1,
132
+ 9,
133
+ 1
134
+ ],
135
+ "dim_embed": [
136
+ 256,
137
+ 512,
138
+ 1024,
139
+ 2048
140
+ ],
141
+ "diversity_penalty": 0.0,
142
+ "do_sample": false,
143
+ "drop_path_rate": 0.1,
144
+ "early_stopping": false,
145
+ "enable_checkpoint": false,
146
+ "encoder_no_repeat_ngram_size": 0,
147
+ "eos_token_id": null,
148
+ "exponential_decay_length_penalty": null,
149
+ "finetuning_task": null,
150
+ "forced_bos_token_id": null,
151
+ "forced_eos_token_id": null,
152
+ "id2label": {
153
+ "0": "LABEL_0",
154
+ "1": "LABEL_1"
155
+ },
156
+ "image_feature_source": [
157
+ "spatial_avg_pool",
158
+ "temporal_avg_pool"
159
+ ],
160
+ "image_pos_embed": {
161
+ "max_pos_embeddings": 50,
162
+ "type": "learned_abs_2d"
163
+ },
164
+ "is_decoder": false,
165
+ "is_encoder_decoder": false,
166
+ "label2id": {
167
+ "LABEL_0": 0,
168
+ "LABEL_1": 1
169
+ },
170
+ "length_penalty": 1.0,
171
+ "max_length": 20,
172
+ "min_length": 0,
173
+ "model_type": "davit",
174
+ "no_repeat_ngram_size": 0,
175
+ "num_beam_groups": 1,
176
+ "num_beams": 1,
177
+ "num_groups": [
178
+ 8,
179
+ 16,
180
+ 32,
181
+ 64
182
+ ],
183
+ "num_heads": [
184
+ 8,
185
+ 16,
186
+ 32,
187
+ 64
188
+ ],
189
+ "num_return_sequences": 1,
190
+ "output_attentions": false,
191
+ "output_hidden_states": false,
192
+ "output_scores": false,
193
+ "pad_token_id": null,
194
+ "patch_padding": [
195
+ 3,
196
+ 1,
197
+ 1,
198
+ 1
199
+ ],
200
+ "patch_prenorm": [
201
+ false,
202
+ true,
203
+ true,
204
+ true
205
+ ],
206
+ "patch_size": [
207
+ 7,
208
+ 3,
209
+ 3,
210
+ 3
211
+ ],
212
+ "patch_stride": [
213
+ 4,
214
+ 2,
215
+ 2,
216
+ 2
217
+ ],
218
+ "prefix": null,
219
+ "problem_type": null,
220
+ "projection_dim": 1536,
221
+ "pruned_heads": {},
222
+ "remove_invalid_values": false,
223
+ "repetition_penalty": 1.0,
224
+ "return_dict": true,
225
+ "return_dict_in_generate": false,
226
+ "sep_token_id": null,
227
+ "suppress_tokens": null,
228
+ "task_specific_params": null,
229
+ "temperature": 1.0,
230
+ "tf_legacy_loss": false,
231
+ "tie_encoder_decoder": false,
232
+ "tie_word_embeddings": true,
233
+ "tokenizer_class": null,
234
+ "top_k": 50,
235
+ "top_p": 1.0,
236
+ "torch_dtype": null,
237
+ "torchscript": false,
238
+ "typical_p": 1.0,
239
+ "use_bfloat16": false,
240
+ "visual_temporal_embedding": {
241
+ "max_temporal_embeddings": 100,
242
+ "type": "COSINE"
243
+ },
244
+ "window_size": 12
245
+ },
246
+ "vocab_size": 151936
247
+ }
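
Because the config above registers custom classes through `auto_map`, loading this checkpoint goes through the remote-code path. A minimal loading sketch; the repository id below is a hypothetical placeholder, not part of this commit:

```python
# Minimal sketch of loading a checkpoint that uses the auto_map above.
# "MonolithFoundation/your-repo-id" is a hypothetical placeholder.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "MonolithFoundation/your-repo-id"
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,   # resolves configuration_florence2.py / modeling_florence2.py
    torch_dtype="bfloat16",   # matches the torch_dtype recorded in config.json
)
```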
configuration_florence2.py ADDED
@@ -0,0 +1,349 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+
16
+ """ Florence-2 configuration"""
17
+
18
+ from typing import Optional
19
+
20
+ from transformers import AutoConfig
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.utils import logging
23
+ from transformers import Qwen2Config
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class Florence2VisionConfig(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
31
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
32
+ defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+ Args:
38
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
39
+ The dropout rate of the drop path layer.
40
+ patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
41
+ The patch size of the image.
42
+ patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
43
+ The patch stride of the image.
44
+ patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
45
+ The patch padding of the image.
46
+ patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
47
+ Whether to apply layer normalization before the patch embedding layer.
48
+ enable_checkpoint (`bool`, *optional*, defaults to False):
49
+ Whether to enable checkpointing.
50
+ dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
51
+ The dimension of the embedding layer.
52
+ num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
53
+ The number of attention heads.
54
+ num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
55
+ The number of groups.
56
+ depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
57
+ The depth of the model.
58
+ window_size (`int`, *optional*, defaults to 12):
59
+ The window size of the model.
60
+ projection_dim (`int`, *optional*, defaults to 1024):
61
+ The dimension of the projection layer.
62
+ visual_temporal_embedding (`dict`, *optional*):
63
+ The configuration of the visual temporal embedding.
64
+ image_pos_embed (`dict`, *optional*):
65
+ The configuration of the image position embedding.
66
+ image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
67
+ The source of the image feature.
68
+ Example:
69
+
70
+ ```python
71
+ >>> from transformers import Florence2VisionConfig, Florence2VisionModel
72
+
73
+ >>> # Initializing a Florence2 Vision style configuration
74
+ >>> configuration = Florence2VisionConfig()
75
+
76
+ >>> # Initializing a model (with random weights)
77
+ >>> model = Florence2VisionModel(configuration)
78
+
79
+ >>> # Accessing the model configuration
80
+ >>> configuration = model.config
81
+ ```"""
82
+
83
+ model_type = "florence2_vision"
84
+ keys_to_ignore_at_inference = ["past_key_values"]
85
+
86
+ def __init__(
87
+ self,
88
+ drop_path_rate=0.1,
89
+ patch_size=[7, 3, 3, 3],
90
+ patch_stride=[4, 2, 2, 2],
91
+ patch_padding=[3, 1, 1, 1],
92
+ patch_prenorm=[False, True, True, True],
93
+ enable_checkpoint=False,
94
+ dim_embed=[256, 512, 1024, 2048],
95
+ num_heads=[8, 16, 32, 64],
96
+ num_groups=[8, 16, 32, 64],
97
+ depths=[1, 1, 9, 1],
98
+ window_size=12,
99
+ projection_dim=1024,
100
+ visual_temporal_embedding=None,
101
+ image_pos_embed=None,
102
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
103
+ **kwargs,
104
+ ):
105
+ self.drop_path_rate = drop_path_rate
106
+ self.patch_size = patch_size
107
+ self.patch_stride = patch_stride
108
+ self.patch_padding = patch_padding
109
+ self.patch_prenorm = patch_prenorm
110
+ self.enable_checkpoint = enable_checkpoint
111
+ self.dim_embed = dim_embed
112
+ self.num_heads = num_heads
113
+ self.num_groups = num_groups
114
+ self.depths = depths
115
+ self.window_size = window_size
116
+ self.projection_dim = projection_dim
117
+ self.visual_temporal_embedding = visual_temporal_embedding
118
+ self.image_pos_embed = image_pos_embed
119
+ self.image_feature_source = image_feature_source
120
+
121
+ super().__init__(**kwargs)
122
+
123
+
124
+ class Florence2LanguageConfig(PretrainedConfig):
125
+ r"""
126
+ This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
127
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
128
+ defaults will yield a similar configuration to that of the BART
129
+ [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
130
+
131
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
132
+ documentation from [`PretrainedConfig`] for more information.
133
+
134
+
135
+ Args:
136
+ vocab_size (`int`, *optional*, defaults to 51289):
137
+ Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
138
+ `input_ids` passed when calling [`Florence2LanguageModel`].
139
+ d_model (`int`, *optional*, defaults to 1024):
140
+ Dimensionality of the layers and the pooler layer.
141
+ encoder_layers (`int`, *optional*, defaults to 12):
142
+ Number of encoder layers.
143
+ decoder_layers (`int`, *optional*, defaults to 12):
144
+ Number of decoder layers.
145
+ encoder_attention_heads (`int`, *optional*, defaults to 16):
146
+ Number of attention heads for each attention layer in the Transformer encoder.
147
+ decoder_attention_heads (`int`, *optional*, defaults to 16):
148
+ Number of attention heads for each attention layer in the Transformer decoder.
149
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
150
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
151
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
152
+ Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
153
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
154
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
155
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
156
+ dropout (`float`, *optional*, defaults to 0.1):
157
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
158
+ attention_dropout (`float`, *optional*, defaults to 0.0):
159
+ The dropout ratio for the attention probabilities.
160
+ activation_dropout (`float`, *optional*, defaults to 0.0):
161
+ The dropout ratio for activations inside the fully connected layer.
162
+ classifier_dropout (`float`, *optional*, defaults to 0.0):
163
+ The dropout ratio for classifier.
164
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
165
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
166
+ just in case (e.g., 512 or 1024 or 2048).
167
+ init_std (`float`, *optional*, defaults to 0.02):
168
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
169
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
170
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
171
+ for more details.
172
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
173
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
174
+ for more details.
175
+ scale_embedding (`bool`, *optional*, defaults to `False`):
176
+ Scale embeddings by dividing by sqrt(d_model).
177
+ use_cache (`bool`, *optional*, defaults to `True`):
178
+ Whether or not the model should return the last key/values attentions (not used by all models).
179
+ num_labels (`int`, *optional*, defaults to 3):
180
+ The number of labels to use in [`Florence2LanguageForSequenceClassification`].
181
+ forced_eos_token_id (`int`, *optional*, defaults to 2):
182
+ The id of the token to force as the last generated token when `max_length` is reached. Usually set to
183
+ `eos_token_id`.
184
+
185
+ Example:
186
+
187
+ ```python
188
+ >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
189
+
190
+ >>> # Initializing a Florence2 Language style configuration
191
+ >>> configuration = Florence2LanguageConfig()
192
+
193
+ >>> # Initializing a model (with random weights)
194
+ >>> model = Florence2LanguageModel(configuration)
195
+
196
+ >>> # Accessing the model configuration
197
+ >>> configuration = model.config
198
+ ```"""
199
+
200
+ model_type = "florence2_language"
201
+ keys_to_ignore_at_inference = ["past_key_values"]
202
+ attribute_map = {
203
+ "num_attention_heads": "encoder_attention_heads",
204
+ "hidden_size": "d_model",
205
+ }
206
+
207
+ def __init__(
208
+ self,
209
+ vocab_size=51289,
210
+ max_position_embeddings=1024,
211
+ encoder_layers=12,
212
+ encoder_ffn_dim=4096,
213
+ encoder_attention_heads=16,
214
+ decoder_layers=12,
215
+ decoder_ffn_dim=4096,
216
+ decoder_attention_heads=16,
217
+ encoder_layerdrop=0.0,
218
+ decoder_layerdrop=0.0,
219
+ activation_function="gelu",
220
+ d_model=1024,
221
+ dropout=0.1,
222
+ attention_dropout=0.0,
223
+ activation_dropout=0.0,
224
+ init_std=0.02,
225
+ classifier_dropout=0.0,
226
+ scale_embedding=False,
227
+ use_cache=True,
228
+ num_labels=3,
229
+ pad_token_id=1,
230
+ bos_token_id=0,
231
+ eos_token_id=2,
232
+ is_encoder_decoder=True,
233
+ decoder_start_token_id=2,
234
+ forced_eos_token_id=2,
235
+ **kwargs,
236
+ ):
237
+ self.vocab_size = vocab_size
238
+ self.max_position_embeddings = max_position_embeddings
239
+ self.d_model = d_model
240
+ self.encoder_ffn_dim = encoder_ffn_dim
241
+ self.encoder_layers = encoder_layers
242
+ self.encoder_attention_heads = encoder_attention_heads
243
+ self.decoder_ffn_dim = decoder_ffn_dim
244
+ self.decoder_layers = decoder_layers
245
+ self.decoder_attention_heads = decoder_attention_heads
246
+ self.dropout = dropout
247
+ self.attention_dropout = attention_dropout
248
+ self.activation_dropout = activation_dropout
249
+ self.activation_function = activation_function
250
+ self.init_std = init_std
251
+ self.encoder_layerdrop = encoder_layerdrop
252
+ self.decoder_layerdrop = decoder_layerdrop
253
+ self.classifier_dropout = classifier_dropout
254
+ self.use_cache = use_cache
255
+ self.num_hidden_layers = encoder_layers
256
+ self.scale_embedding = (
257
+ scale_embedding # scale factor will be sqrt(d_model) if True
258
+ )
259
+
260
+ super().__init__(
261
+ num_labels=num_labels,
262
+ pad_token_id=pad_token_id,
263
+ bos_token_id=bos_token_id,
264
+ eos_token_id=eos_token_id,
265
+ is_encoder_decoder=is_encoder_decoder,
266
+ decoder_start_token_id=decoder_start_token_id,
267
+ forced_eos_token_id=forced_eos_token_id,
268
+ **kwargs,
269
+ )
270
+
271
+ # ensure backward compatibility for BART CNN models
272
+ if self.forced_bos_token_id is None and kwargs.get(
273
+ "force_bos_token_to_be_generated", False
274
+ ):
275
+ self.forced_bos_token_id = self.bos_token_id
276
+ warnings.warn(
277
+ f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
278
+ "The config can simply be saved and uploaded again to be fixed."
279
+ )
280
+
281
+
282
+ class Florence2Config(Qwen2Config):
283
+ r"""
284
+ This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
285
+ Florence-2 model according to the specified arguments, defining the model architecture.
286
+
287
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
288
+ documentation from [`PretrainedConfig`] for more information.
289
+
290
+ Args:
291
+ vision_config (`Florence2VisionConfig`, *optional*):
292
+ Custom vision config or dict
293
+ text_config (`Union[AutoConfig, dict]`, *optional*):
294
+ The config object of the text backbone.
295
+ ignore_index (`int`, *optional*, defaults to -100):
296
+ The ignore index for the loss function.
297
+ vocab_size (`int`, *optional*, defaults to 51289):
298
+ Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
299
+ `input_ids` passed when calling [`~Florence2ForConditionalGeneration`]
300
+ projection_dim (`int`, *optional*, defaults to 1024):
301
+ Dimension of the multimodal projection space.
302
+
303
+ Example:
304
+
305
+ ```python
306
+ >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
307
+
308
+ >>> # Initializing a clip-like vision config
309
+ >>> vision_config = CLIPVisionConfig()
310
+
311
+ >>> # Initializing a Bart config
312
+ >>> text_config = BartConfig()
313
+
314
+ >>> # Initializing a Florence-2 configuration
315
+ >>> configuration = Florence2Config(vision_config, text_config)
316
+
317
+ >>> # Initializing a model from the florence-2 configuration
318
+ >>> model = Florence2ForConditionalGeneration(configuration)
319
+
320
+ >>> # Accessing the model configuration
321
+ >>> configuration = model.config
322
+ ```"""
323
+
324
+ model_type = "florence2"
325
+ is_composition = False
326
+
327
+ def __init__(
328
+ self,
329
+ vision_config=None,
330
+ text_config=None,
331
+ ignore_index=-100,
332
+ vocab_size=51289,
333
+ projection_dim=1024,
334
+ **kwargs,
335
+ ):
336
+ self.ignore_index = ignore_index
337
+ self.vocab_size = vocab_size
338
+ self.projection_dim = projection_dim
339
+ if vision_config is not None:
340
+ vision_config = PretrainedConfig(**vision_config)
341
+ self.vision_config = vision_config
342
+ self.vocab_size = self.vocab_size
343
+
344
+ self.text_config = text_config
345
+ if text_config is not None:
346
+ # self.text_config = Florence2LanguageConfig(**text_config)
347
+ self.text_config = Qwen2Config(**text_config)
348
+
349
+ super().__init__(**kwargs)
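
As the `__init__` above shows, `Florence2Config` wraps a `vision_config` dict in a plain `PretrainedConfig` and a `text_config` dict in a `Qwen2Config`. A small illustrative sketch, assuming the module is importable from a checkout of this repository; the values are arbitrary, not the checkpoint's actual hyperparameters:

```python
# Illustrative only: arbitrary values, not the hyperparameters stored in config.json.
from configuration_florence2 import Florence2Config

config = Florence2Config(
    vision_config={"window_size": 12, "depths": [1, 1, 9, 1]},   # becomes a PretrainedConfig
    text_config={"hidden_size": 1536, "num_hidden_layers": 28},  # becomes a Qwen2Config
    projection_dim=1024,
)
print(type(config.vision_config).__name__)  # PretrainedConfig
print(type(config.text_config).__name__)    # Qwen2Config
```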
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.46.1"
7
+ }
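
These generation defaults mirror the token ids in config.json and can be inspected or overridden at generation time. A brief sketch; the repo id is a hypothetical placeholder:

```python
# Sketch only; "MonolithFoundation/your-repo-id" is a hypothetical placeholder.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("MonolithFoundation/your-repo-id")
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)
# outputs = model.generate(**inputs, generation_config=gen_config, max_new_tokens=64)
```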
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1646640fb73d58ca5877fb55d5eb9cce8b69d0e3889d42646dcb3c1b865e1da2
3
+ size 3815697496
model.safetensors.index.json ADDED
@@ -0,0 +1,751 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 7631210496
4
+ },
5
+ "weight_map": {
6
+ "image_pos_embed.column_embeddings.weight": "model-00002-of-00002.safetensors",
7
+ "image_pos_embed.row_embeddings.weight": "model-00002-of-00002.safetensors",
8
+ "image_proj_norm.bias": "model-00002-of-00002.safetensors",
9
+ "image_proj_norm.weight": "model-00002-of-00002.safetensors",
10
+ "image_projection": "model-00001-of-00002.safetensors",
11
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
21
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
23
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
33
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
35
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
45
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
47
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
57
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
59
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
69
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
71
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
81
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
83
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
93
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
95
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
105
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
107
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
117
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
119
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
129
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
131
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
141
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
143
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
153
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
155
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
165
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
167
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
177
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
179
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
189
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
191
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
198
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
201
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
203
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
210
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
211
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
213
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
215
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
222
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
223
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
225
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
227
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
234
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
237
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
239
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
246
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
249
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
251
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
258
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
261
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
263
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
273
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
274
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
275
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
276
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
277
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
278
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
280
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
281
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
282
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
283
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
284
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
285
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
287
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
292
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
294
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
297
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
299
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
300
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
301
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
302
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
304
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
306
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
308
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
309
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
310
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
311
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
312
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
313
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
314
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
315
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
316
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
317
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
318
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
319
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
320
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
321
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
322
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
323
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
324
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
325
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
326
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
327
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
328
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
329
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
330
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
331
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
332
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
333
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
334
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
335
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
336
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
337
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
338
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
342
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
345
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
347
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.norm.weight": "model-00002-of-00002.safetensors",
349
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
350
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
351
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
352
+ "vision_tower.blocks.0.0.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
353
+ "vision_tower.blocks.0.0.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
354
+ "vision_tower.blocks.0.0.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
355
+ "vision_tower.blocks.0.0.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
356
+ "vision_tower.blocks.0.0.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
357
+ "vision_tower.blocks.0.0.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
358
+ "vision_tower.blocks.0.0.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
359
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
360
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
361
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
362
+ "vision_tower.blocks.0.0.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
363
+ "vision_tower.blocks.0.0.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
364
+ "vision_tower.blocks.0.0.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
365
+ "vision_tower.blocks.0.0.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
366
+ "vision_tower.blocks.0.0.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
367
+ "vision_tower.blocks.0.0.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
368
+ "vision_tower.blocks.0.0.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
369
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
370
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
371
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
372
+ "vision_tower.blocks.0.0.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
373
+ "vision_tower.blocks.0.0.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
374
+ "vision_tower.blocks.0.0.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
375
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
376
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
377
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
378
+ "vision_tower.blocks.0.0.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
379
+ "vision_tower.blocks.0.0.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
380
+ "vision_tower.blocks.0.0.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
381
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
382
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
383
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
384
+ "vision_tower.blocks.1.0.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
385
+ "vision_tower.blocks.1.0.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
386
+ "vision_tower.blocks.1.0.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
387
+ "vision_tower.blocks.1.0.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
388
+ "vision_tower.blocks.1.0.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
389
+ "vision_tower.blocks.1.0.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
390
+ "vision_tower.blocks.1.0.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
391
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
392
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
393
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
394
+ "vision_tower.blocks.1.0.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
395
+ "vision_tower.blocks.1.0.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
396
+ "vision_tower.blocks.1.0.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
397
+ "vision_tower.blocks.1.0.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
398
+ "vision_tower.blocks.1.0.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
399
+ "vision_tower.blocks.1.0.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
400
+ "vision_tower.blocks.1.0.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
401
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
402
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
403
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
404
+ "vision_tower.blocks.1.0.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
405
+ "vision_tower.blocks.1.0.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
406
+ "vision_tower.blocks.1.0.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
407
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
408
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
409
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
410
+ "vision_tower.blocks.1.0.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
411
+ "vision_tower.blocks.1.0.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
412
+ "vision_tower.blocks.1.0.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
413
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
414
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
415
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
416
+ "vision_tower.blocks.2.0.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
417
+ "vision_tower.blocks.2.0.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
418
+ "vision_tower.blocks.2.0.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
419
+ "vision_tower.blocks.2.0.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
420
+ "vision_tower.blocks.2.0.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
421
+ "vision_tower.blocks.2.0.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
422
+ "vision_tower.blocks.2.0.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
423
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
424
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
425
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
426
+ "vision_tower.blocks.2.0.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
427
+ "vision_tower.blocks.2.0.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
428
+ "vision_tower.blocks.2.0.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
429
+ "vision_tower.blocks.2.0.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
430
+ "vision_tower.blocks.2.0.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
431
+ "vision_tower.blocks.2.0.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
432
+ "vision_tower.blocks.2.0.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
433
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
434
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
435
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
436
+ "vision_tower.blocks.2.0.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
437
+ "vision_tower.blocks.2.0.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
438
+ "vision_tower.blocks.2.0.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
439
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
440
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
441
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
442
+ "vision_tower.blocks.2.0.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
443
+ "vision_tower.blocks.2.0.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
444
+ "vision_tower.blocks.2.0.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
445
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
446
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
447
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
448
+ "vision_tower.blocks.2.1.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
449
+ "vision_tower.blocks.2.1.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
450
+ "vision_tower.blocks.2.1.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
451
+ "vision_tower.blocks.2.1.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
452
+ "vision_tower.blocks.2.1.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
453
+ "vision_tower.blocks.2.1.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
454
+ "vision_tower.blocks.2.1.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
455
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
456
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
457
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
458
+ "vision_tower.blocks.2.1.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
459
+ "vision_tower.blocks.2.1.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
460
+ "vision_tower.blocks.2.1.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
461
+ "vision_tower.blocks.2.1.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
462
+ "vision_tower.blocks.2.1.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
463
+ "vision_tower.blocks.2.1.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
464
+ "vision_tower.blocks.2.1.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
465
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
466
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
467
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
468
+ "vision_tower.blocks.2.1.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
469
+ "vision_tower.blocks.2.1.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
470
+ "vision_tower.blocks.2.1.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
471
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
472
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
473
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
474
+ "vision_tower.blocks.2.1.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
475
+ "vision_tower.blocks.2.1.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
476
+ "vision_tower.blocks.2.1.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
477
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
478
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
479
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
480
+ "vision_tower.blocks.2.2.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
481
+ "vision_tower.blocks.2.2.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
482
+ "vision_tower.blocks.2.2.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
483
+ "vision_tower.blocks.2.2.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
484
+ "vision_tower.blocks.2.2.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
485
+ "vision_tower.blocks.2.2.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
486
+ "vision_tower.blocks.2.2.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
487
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
488
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
489
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
490
+ "vision_tower.blocks.2.2.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
491
+ "vision_tower.blocks.2.2.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
492
+ "vision_tower.blocks.2.2.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
493
+ "vision_tower.blocks.2.2.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
494
+ "vision_tower.blocks.2.2.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
495
+ "vision_tower.blocks.2.2.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
496
+ "vision_tower.blocks.2.2.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
497
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
498
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
499
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
500
+ "vision_tower.blocks.2.2.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
501
+ "vision_tower.blocks.2.2.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
502
+ "vision_tower.blocks.2.2.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
503
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
504
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
505
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
506
+ "vision_tower.blocks.2.2.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
507
+ "vision_tower.blocks.2.2.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
508
+ "vision_tower.blocks.2.2.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
509
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
510
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
511
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
512
+ "vision_tower.blocks.2.3.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
513
+ "vision_tower.blocks.2.3.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
514
+ "vision_tower.blocks.2.3.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
515
+ "vision_tower.blocks.2.3.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
516
+ "vision_tower.blocks.2.3.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
517
+ "vision_tower.blocks.2.3.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
518
+ "vision_tower.blocks.2.3.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
519
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
520
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
521
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
522
+ "vision_tower.blocks.2.3.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
523
+ "vision_tower.blocks.2.3.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
524
+ "vision_tower.blocks.2.3.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
525
+ "vision_tower.blocks.2.3.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
526
+ "vision_tower.blocks.2.3.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
527
+ "vision_tower.blocks.2.3.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
528
+ "vision_tower.blocks.2.3.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
529
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
530
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
531
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
532
+ "vision_tower.blocks.2.3.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
533
+ "vision_tower.blocks.2.3.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
534
+ "vision_tower.blocks.2.3.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
535
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
536
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
537
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
538
+ "vision_tower.blocks.2.3.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
539
+ "vision_tower.blocks.2.3.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
540
+ "vision_tower.blocks.2.3.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
541
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
542
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
543
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
544
+ "vision_tower.blocks.2.4.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
545
+ "vision_tower.blocks.2.4.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
546
+ "vision_tower.blocks.2.4.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
547
+ "vision_tower.blocks.2.4.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
548
+ "vision_tower.blocks.2.4.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
549
+ "vision_tower.blocks.2.4.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
550
+ "vision_tower.blocks.2.4.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
551
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
552
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
553
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
554
+ "vision_tower.blocks.2.4.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
555
+ "vision_tower.blocks.2.4.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
556
+ "vision_tower.blocks.2.4.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
557
+ "vision_tower.blocks.2.4.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
558
+ "vision_tower.blocks.2.4.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
559
+ "vision_tower.blocks.2.4.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
560
+ "vision_tower.blocks.2.4.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
561
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
562
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
563
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
564
+ "vision_tower.blocks.2.4.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
565
+ "vision_tower.blocks.2.4.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
566
+ "vision_tower.blocks.2.4.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
567
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
568
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
569
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
570
+ "vision_tower.blocks.2.4.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
571
+ "vision_tower.blocks.2.4.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
572
+ "vision_tower.blocks.2.4.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
573
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
574
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
575
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
576
+ "vision_tower.blocks.2.5.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
577
+ "vision_tower.blocks.2.5.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
578
+ "vision_tower.blocks.2.5.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
579
+ "vision_tower.blocks.2.5.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
580
+ "vision_tower.blocks.2.5.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
581
+ "vision_tower.blocks.2.5.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
582
+ "vision_tower.blocks.2.5.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
583
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
584
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
585
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
586
+ "vision_tower.blocks.2.5.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
587
+ "vision_tower.blocks.2.5.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
588
+ "vision_tower.blocks.2.5.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
589
+ "vision_tower.blocks.2.5.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
590
+ "vision_tower.blocks.2.5.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
591
+ "vision_tower.blocks.2.5.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
592
+ "vision_tower.blocks.2.5.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
593
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
594
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
595
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
596
+ "vision_tower.blocks.2.5.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
597
+ "vision_tower.blocks.2.5.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
598
+ "vision_tower.blocks.2.5.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
599
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
600
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
601
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
602
+ "vision_tower.blocks.2.5.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
603
+ "vision_tower.blocks.2.5.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
604
+ "vision_tower.blocks.2.5.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
605
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
606
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
607
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
608
+ "vision_tower.blocks.2.6.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
609
+ "vision_tower.blocks.2.6.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
610
+ "vision_tower.blocks.2.6.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
611
+ "vision_tower.blocks.2.6.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
612
+ "vision_tower.blocks.2.6.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
613
+ "vision_tower.blocks.2.6.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
614
+ "vision_tower.blocks.2.6.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
615
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
616
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
617
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
618
+ "vision_tower.blocks.2.6.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
619
+ "vision_tower.blocks.2.6.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
620
+ "vision_tower.blocks.2.6.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
621
+ "vision_tower.blocks.2.6.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
622
+ "vision_tower.blocks.2.6.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
623
+ "vision_tower.blocks.2.6.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
624
+ "vision_tower.blocks.2.6.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
625
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
626
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
627
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
628
+ "vision_tower.blocks.2.6.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
629
+ "vision_tower.blocks.2.6.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
630
+ "vision_tower.blocks.2.6.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
631
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
632
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
633
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
634
+ "vision_tower.blocks.2.6.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
635
+ "vision_tower.blocks.2.6.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
636
+ "vision_tower.blocks.2.6.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
637
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
638
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
639
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
640
+ "vision_tower.blocks.2.7.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
641
+ "vision_tower.blocks.2.7.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
642
+ "vision_tower.blocks.2.7.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
643
+ "vision_tower.blocks.2.7.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
644
+ "vision_tower.blocks.2.7.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
645
+ "vision_tower.blocks.2.7.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
646
+ "vision_tower.blocks.2.7.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
647
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
648
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
649
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
650
+ "vision_tower.blocks.2.7.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
651
+ "vision_tower.blocks.2.7.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
652
+ "vision_tower.blocks.2.7.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
653
+ "vision_tower.blocks.2.7.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
654
+ "vision_tower.blocks.2.7.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
655
+ "vision_tower.blocks.2.7.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
656
+ "vision_tower.blocks.2.7.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
657
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
658
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
659
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
660
+ "vision_tower.blocks.2.7.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
661
+ "vision_tower.blocks.2.7.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
662
+ "vision_tower.blocks.2.7.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
663
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
664
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
665
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
666
+ "vision_tower.blocks.2.7.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
667
+ "vision_tower.blocks.2.7.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
668
+ "vision_tower.blocks.2.7.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
669
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
670
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
671
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
672
+ "vision_tower.blocks.2.8.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
673
+ "vision_tower.blocks.2.8.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
674
+ "vision_tower.blocks.2.8.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
675
+ "vision_tower.blocks.2.8.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
676
+ "vision_tower.blocks.2.8.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
677
+ "vision_tower.blocks.2.8.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
678
+ "vision_tower.blocks.2.8.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
679
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
680
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
681
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
682
+ "vision_tower.blocks.2.8.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
683
+ "vision_tower.blocks.2.8.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
684
+ "vision_tower.blocks.2.8.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
685
+ "vision_tower.blocks.2.8.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
686
+ "vision_tower.blocks.2.8.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
687
+ "vision_tower.blocks.2.8.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
688
+ "vision_tower.blocks.2.8.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
689
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
690
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
691
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
692
+ "vision_tower.blocks.2.8.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
693
+ "vision_tower.blocks.2.8.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
694
+ "vision_tower.blocks.2.8.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
695
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
696
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
697
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
698
+ "vision_tower.blocks.2.8.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
699
+ "vision_tower.blocks.2.8.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
700
+ "vision_tower.blocks.2.8.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
701
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
702
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
703
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
704
+ "vision_tower.blocks.3.0.channel_block.channel_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
705
+ "vision_tower.blocks.3.0.channel_block.channel_attn.norm.bias": "model-00002-of-00002.safetensors",
706
+ "vision_tower.blocks.3.0.channel_block.channel_attn.norm.weight": "model-00002-of-00002.safetensors",
707
+ "vision_tower.blocks.3.0.channel_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
708
+ "vision_tower.blocks.3.0.channel_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
709
+ "vision_tower.blocks.3.0.channel_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
710
+ "vision_tower.blocks.3.0.channel_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
711
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
712
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
713
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
714
+ "vision_tower.blocks.3.0.channel_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
715
+ "vision_tower.blocks.3.0.channel_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
716
+ "vision_tower.blocks.3.0.channel_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
717
+ "vision_tower.blocks.3.0.spatial_block.conv1.fn.dw.bias": "model-00002-of-00002.safetensors",
718
+ "vision_tower.blocks.3.0.spatial_block.conv1.fn.dw.weight": "model-00002-of-00002.safetensors",
719
+ "vision_tower.blocks.3.0.spatial_block.conv2.fn.dw.bias": "model-00002-of-00002.safetensors",
720
+ "vision_tower.blocks.3.0.spatial_block.conv2.fn.dw.weight": "model-00002-of-00002.safetensors",
721
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc1.bias": "model-00002-of-00002.safetensors",
722
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc1.weight": "model-00002-of-00002.safetensors",
723
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc2.bias": "model-00002-of-00002.safetensors",
724
+ "vision_tower.blocks.3.0.spatial_block.ffn.fn.net.fc2.weight": "model-00002-of-00002.safetensors",
725
+ "vision_tower.blocks.3.0.spatial_block.ffn.norm.bias": "model-00002-of-00002.safetensors",
726
+ "vision_tower.blocks.3.0.spatial_block.ffn.norm.weight": "model-00002-of-00002.safetensors",
727
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.proj.bias": "model-00002-of-00002.safetensors",
728
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.proj.weight": "model-00002-of-00002.safetensors",
729
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.qkv.bias": "model-00002-of-00002.safetensors",
730
+ "vision_tower.blocks.3.0.spatial_block.window_attn.fn.qkv.weight": "model-00002-of-00002.safetensors",
731
+ "vision_tower.blocks.3.0.spatial_block.window_attn.norm.bias": "model-00002-of-00002.safetensors",
732
+ "vision_tower.blocks.3.0.spatial_block.window_attn.norm.weight": "model-00002-of-00002.safetensors",
733
+ "vision_tower.convs.0.norm.bias": "model-00002-of-00002.safetensors",
734
+ "vision_tower.convs.0.norm.weight": "model-00002-of-00002.safetensors",
735
+ "vision_tower.convs.0.proj.bias": "model-00002-of-00002.safetensors",
736
+ "vision_tower.convs.0.proj.weight": "model-00002-of-00002.safetensors",
737
+ "vision_tower.convs.1.norm.bias": "model-00002-of-00002.safetensors",
738
+ "vision_tower.convs.1.norm.weight": "model-00002-of-00002.safetensors",
739
+ "vision_tower.convs.1.proj.bias": "model-00002-of-00002.safetensors",
740
+ "vision_tower.convs.1.proj.weight": "model-00002-of-00002.safetensors",
741
+ "vision_tower.convs.2.norm.bias": "model-00002-of-00002.safetensors",
742
+ "vision_tower.convs.2.norm.weight": "model-00002-of-00002.safetensors",
743
+ "vision_tower.convs.2.proj.bias": "model-00002-of-00002.safetensors",
744
+ "vision_tower.convs.2.proj.weight": "model-00002-of-00002.safetensors",
745
+ "vision_tower.convs.3.norm.bias": "model-00002-of-00002.safetensors",
746
+ "vision_tower.convs.3.norm.weight": "model-00002-of-00002.safetensors",
747
+ "vision_tower.convs.3.proj.bias": "model-00002-of-00002.safetensors",
748
+ "vision_tower.convs.3.proj.weight": "model-00002-of-00002.safetensors",
749
+ "visual_temporal_embed.pos_idx_to_embed": "model-00002-of-00002.safetensors"
750
+ }
751
+ }
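The weight_map above ties every parameter name, from the vision_tower blocks down to visual_temporal_embed.pos_idx_to_embed, to the safetensors shard that stores it. A minimal sketch of resolving the index by hand, assuming a local copy of this repo with model.safetensors.index.json and the shard files in the working directory (paths are illustrative):

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# The index maps each tensor name to the shard file that contains it.
name = "vision_tower.convs.0.proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00002-of-00002.safetensors"

# Open only that shard and pull out the single tensor.
with safe_open(shard, framework="pt") as f:
    tensor = f.get_tensor(name)
print(name, tuple(tensor.shape))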
modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
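Because the model implementation ships as modeling_florence2.py inside the repo rather than as a built-in transformers class, loading it goes through the remote-code path. A hedged sketch, assuming the repo's config exposes the architecture through an AutoModelForCausalLM mapping (the repo id below is a placeholder):

from transformers import AutoModelForCausalLM

repo = "<this-repo-id-or-local-path>"  # placeholder, substitute the real id or path

# trust_remote_code=True lets transformers import the modeling code shipped with
# the repo instead of looking for an implementation inside the library itself.
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)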
 
preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "_valid_processor_keys": [
6
+ "images",
7
+ "do_resize",
8
+ "size",
9
+ "resample",
10
+ "do_rescale",
11
+ "rescale_factor",
12
+ "do_normalize",
13
+ "image_mean",
14
+ "image_std",
15
+ "return_tensors",
16
+ "data_format",
17
+ "input_data_format",
18
+ "do_convert_rgb"
19
+ ],
20
+ "do_convert_rgb": null,
21
+ "do_normalize": true,
22
+ "do_rescale": true,
23
+ "do_resize": true,
24
+ "do_center_crop": false,
25
+ "image_processor_type": "CLIPImageProcessor",
26
+ "image_seq_length": 577,
27
+ "image_mean": [0.485, 0.456, 0.406],
28
+ "image_std": [0.229, 0.224, 0.225],
29
+ "processor_class": "Florence2Processor",
30
+ "resample": 3,
31
+ "size": {
32
+ "height": 768,
33
+ "width": 768
34
+ },
35
+ "crop_size": {
36
+ "height": 768,
37
+ "width": 768
38
+ }
39
+ }
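This config registers processing_florence2.Florence2Processor with AutoProcessor and drives a stock CLIPImageProcessor: bicubic resize to 768x768 with no center crop, rescaling, and normalization with the listed ImageNet mean/std. A small sketch of that image branch in isolation, assuming a local PIL image (the file name is made up):

from PIL import Image
from transformers import CLIPImageProcessor

image = Image.open("example.jpg")  # hypothetical input image

# Mirrors the settings in preprocessor_config.json above.
image_processor = CLIPImageProcessor(
    do_resize=True,
    size={"height": 768, "width": 768},
    resample=3,                        # PIL bicubic, as in the config
    do_center_crop=False,
    do_rescale=True,
    do_normalize=True,
    image_mean=[0.485, 0.456, 0.406],
    image_std=[0.229, 0.224, 0.225],
)
pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # expected (1, 3, 768, 768)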
processing_florence2.py ADDED
@@ -0,0 +1,1135 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
36
+
37
+ IMAGE_TOKEN_INDEX = -200
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
42
+ def is_url(val) -> bool:
43
+ return isinstance(val, str) and val.startswith("http")
44
+
45
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
46
+ def is_image_or_image_url(elem):
47
+ return is_url(elem) or is_valid_image(elem)
48
+
49
+
50
+ def _is_str_or_image(elem):
51
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
52
+
53
+
54
+ class Florence2Processor(ProcessorMixin):
55
+ r"""
56
+ Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
57
+
58
+ [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`Qwen2TokenizerFast`]. See the
59
+ [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
60
+
61
+ Args:
62
+ image_processor ([`CLIPImageProcessor`], *optional*):
63
+ The image processor is a required input.
64
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
65
+ The tokenizer is a required input.
66
+ """
67
+
68
+ attributes = ["image_processor", "tokenizer"]
69
+ image_processor_class = "CLIPImageProcessor"
70
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
71
+
72
+ def __init__(
73
+ self,
74
+ image_processor=None,
75
+ tokenizer=None,
76
+ ):
77
+ if image_processor is None:
78
+ raise ValueError("You need to specify an `image_processor`.")
79
+ if tokenizer is None:
80
+ raise ValueError("You need to specify a `tokenizer`.")
81
+ if not hasattr(image_processor, "image_seq_length"):
82
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
83
+
84
+ self.image_seq_length = image_processor.image_seq_length
85
+
86
+ tokens_to_add = {
87
+ 'additional_special_tokens': \
88
+ tokenizer.additional_special_tokens + \
89
+ ['<od>', '</od>', '<ocr>', '</ocr>'] + \
90
+ [f'<loc_{x}>' for x in range(1000)] + \
91
+ ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
92
+ }
93
+ tokenizer.add_special_tokens(tokens_to_add)
94
+
95
+ self.tasks_answer_post_processing_type = {
96
+ '<OCR>': 'pure_text',
97
+ '<OCR_TABLE_MD>': 'pure_text',
98
+ '<OCR_WITH_REGION>': 'ocr',
99
+ '<CAPTION>': 'pure_text',
100
+ '<CAPTION_CN>': 'pure_text',
101
+ '<DETAILED_CAPTION>': 'pure_text',
102
+ '<DETAILED_CAPTION_CN>': 'pure_text',
103
+ '<MORE_DETAILED_CAPTION>': 'pure_text',
104
+ '<MORE_DETAILED_CAPTION_CN>': 'pure_text',
105
+ '<OD>': 'description_with_bboxes',
106
+ '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
107
+ '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
108
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
109
+ '<REGION_TO_SEGMENTATION>': 'polygons',
110
+ '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
111
+ '<REGION_TO_CATEGORY>': 'pure_text',
112
+ '<REGION_TO_DESCRIPTION>': 'pure_text',
113
+ '<REGION_TO_OCR>': 'pure_text',
114
+ '<REGION_PROPOSAL>': 'bboxes'
115
+ }
116
+
117
+ self.task_prompts_without_inputs = {
118
+ '<OCR>': 'What is the text in the image?',
119
+ '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
120
+ '<OCR_TABLE_MD>': 'Convert the image into markdown style reconstructed table in text.',
121
+ '<CAPTION>': 'What does the image describe?',
122
+ '<CAPTION_CN>': '图中描述了什么?',
123
+ '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
124
+ '<DETAILED_CAPTION_CN>': '详细描述图像中所展示的内容。',
125
+ '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
126
+ '<MORE_DETAILED_CAPTION_CN>': '用一段话描述图像中所展示的内容。',
127
+ '<OD>': 'Locate the objects with category name in the image.',
128
+ '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
129
+ '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
130
+ }
131
+
132
+ self.task_prompts_with_input = {
133
+ '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
134
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
135
+ '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
136
+ '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
137
+ '<REGION_TO_CATEGORY>': 'What is the region {input}?',
138
+ '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
139
+ '<REGION_TO_OCR>': 'What text is in the region {input}?',
140
+ }
141
+
142
+ self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
143
+
144
+
145
+ super().__init__(image_processor, tokenizer)
146
+
147
+ def _construct_prompts(self, text):
148
+ # replace the task tokens with the task prompts if task token is in the text
149
+ sys_prompt = '<|im_start|>system\nYou are a helpful AI assistant<|im_end|>\n'
150
+ prompts = []
151
+ for _text in text:
152
+ if _text in self.task_prompts_without_inputs.keys():
153
+ # 1. fixed task prompts without additional inputs
154
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
155
+ if task_token in _text:
156
+ assert _text == task_token, f"Task token {task_token} should be the only token in the text."
157
+ _text = task_prompt
158
+ break
159
+ # 2. task prompts with additional inputs
160
+ for task_token, task_prompt in self.task_prompts_with_input.items():
161
+ if task_token in _text:
162
+ _text = task_prompt.format(input=_text.replace(task_token, ''))
163
+ break
164
+ # add prompt template here
165
+ _text = f'{sys_prompt}<|im_start|>user\n<image>\n{_text}<|im_end|>\n<|im_start|>assistant:\n'
166
+ else:
167
+ _text = f'{sys_prompt}<|im_start|>user\n{_text}<|im_end|>\n<|im_start|>assistant:\n'
168
+ prompts.append(_text)
169
+ return prompts
170
+
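# Illustration (editor's sketch, not from the diff): for the fixed task token '<OCR>'
# the branch above substitutes 'What is the text in the image?' and wraps it in the
# chat template, so the constructed prompt is roughly:
#   <|im_start|>system
#   You are a helpful AI assistant<|im_end|>
#   <|im_start|>user
#   <image>
#   What is the text in the image?<|im_end|>
#   <|im_start|>assistant:
# Free-form text that is not a recognized task token takes the else branch and gets
# the same template without the <image> placeholder.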
171
+ def tokenizer_image_token(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None, add_special_tokens=True):
172
+ prompt_chunks = [tokenizer(chunk, add_special_tokens=add_special_tokens).input_ids for chunk in prompt.split('<image>')]
173
+
174
+ def insert_separator(X, sep):
175
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
176
+
177
+ input_ids = []
178
+ offset = 0
179
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
180
+ offset = 1
181
+ input_ids.append(prompt_chunks[0][0])
182
+
183
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
184
+ input_ids.extend(x[offset:])
185
+
186
+ if return_tensors is not None:
187
+ if return_tensors == 'pt':
188
+ return torch.tensor(input_ids, dtype=torch.long)
189
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
190
+ return input_ids
191
+
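# Illustration (editor's sketch, not from the diff): tokenizer_image_token splits the
# prompt on '<image>', tokenizes each chunk separately, and splices the sentinel
# IMAGE_TOKEN_INDEX (-200) between the chunks, e.g.
#   'A<image>B'  ->  token_ids('A') + [-200] + token_ids('B')
# The -200 id never exists in the tokenizer vocabulary; presumably the model code in
# modeling_florence2.py (not rendered above) swaps it for projected image features.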
192
+ def __call__(
193
+ self,
194
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
195
+ images: ImageInput = None,
196
+ tokenize_newline_separately: bool = True,
197
+ padding: Union[bool, str, PaddingStrategy] = False,
198
+ truncation: Union[bool, str, TruncationStrategy] = None,
199
+ max_length=None,
200
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
201
+ do_resize: bool = None,
202
+ do_normalize: bool = None,
203
+ image_mean: Optional[Union[float, List[float]]] = None,
204
+ image_std: Optional[Union[float, List[float]]] = None,
205
+ data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
206
+ input_data_format: Optional[
207
+ Union[str, "ChannelDimension"] # noqa: F821
208
+ ] = None,
209
+ resample: "PILImageResampling" = None, # noqa: F821
210
+ size=None,
211
+ do_convert_rgb: bool = None,
212
+ do_thumbnail: bool = None,
213
+ do_align_long_axis: bool = None,
214
+ do_rescale: bool = None,
215
+ ) -> BatchFeature:
216
+ """
217
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
218
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
219
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
220
+ CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
221
+ of the above two methods for more information.
222
+
223
+ Args:
224
+ text (`str`, `List[str]`, `List[List[str]]`):
225
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
226
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
227
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
228
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
229
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
230
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
231
+ number of channels, H and W are image height and width.
232
+ tokenize_newline_separately (`bool`, defaults to `True`):
233
+ Adds a separately tokenized '\n' at the end of the prompt.
234
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
235
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
236
+ index) among:
237
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
238
+ sequence is provided).
239
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
240
+ acceptable input length for the model if that argument is not provided.
241
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
242
+ lengths).
243
+ max_length (`int`, *optional*):
244
+ Maximum length of the returned list and optionally padding length (see above).
245
+ truncation (`bool`, *optional*):
246
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
247
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
248
+ If set, will return tensors of a particular framework. Acceptable values are:
249
+
250
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
251
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
252
+ - `'np'`: Return NumPy `np.ndarray` objects.
253
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
254
+
255
+ Returns:
256
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
257
+
258
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
259
+ is provided, the `input_ids` will also contain the suffix input ids.
260
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
261
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
262
+ `None`).
263
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
264
+ - **labels** -- Labels compatible with training if `suffix` is not None
265
+ """
266
+
267
+ return_token_type_ids = False
268
+
269
+ if images is None:
270
+ raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
271
+ if text is None:
272
+ logger.warning(
273
+ "You are using Florence-2 without a text prompt."
274
+ )
275
+ text = ""
276
+
277
+ if isinstance(text, List) and isinstance(images, List):
278
+ if len(images) < len(text):
279
+ raise ValueError(
280
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
281
+ )
282
+ if _is_str_or_image(text):
283
+ text = [text]
284
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
285
+ pass
286
+
287
+ pixel_values = self.image_processor(
288
+ images,
289
+ size=size,
290
+ do_resize=do_resize,
291
+ do_normalize=do_normalize,
292
+ return_tensors=return_tensors,
293
+ image_mean=image_mean,
294
+ image_std=image_std,
295
+ input_data_format=input_data_format,
296
+ data_format=data_format,
297
+ resample=resample,
298
+ do_convert_rgb=do_convert_rgb,
299
+ )["pixel_values"]
300
+
301
+ if max_length is not None:
302
+ max_length -= self.image_seq_length # max_length has to account for the image tokens
303
+
304
+ text = self._construct_prompts(text)
305
+
306
+ # inputs = self.tokenizer(
307
+ # text,
308
+ # return_tensors=return_tensors,
309
+ # padding=padding,
310
+ # max_length=max_length,
311
+ # truncation=truncation,
312
+ # return_token_type_ids=return_token_type_ids,
313
+ # )
314
+ # print(text)
315
+ inputs = []
316
+ for t in text:
317
+ tokenized = self.tokenizer_image_token(t, self.tokenizer)
318
+ tokenized_tensor = torch.tensor(tokenized, dtype=torch.long)
319
+ inputs.append(tokenized_tensor)
320
+ inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=self.tokenizer.pad_token_id)
321
+ inputs = {'input_ids': inputs}
322
+ # print(inputs)
323
+
324
+ return_data = {**inputs, "pixel_values": pixel_values}
325
+
326
+ if return_token_type_ids:
327
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
328
+ return_data.update({"labels": labels})
329
+ return BatchFeature(data=return_data)
330
+
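# Editor's sketch (not from the diff): a typical end-to-end call, assuming `processor`
# and `model` were loaded from this repo with trust_remote_code=True and `image` is a
# PIL.Image:
#   inputs = processor(text='<OCR>', images=image, return_tensors='pt')
#   # inputs['input_ids']    -> padded ids containing the -200 image placeholders
#   # inputs['pixel_values'] -> the preprocessed 768x768 image tensor
#   generated_ids = model.generate(**inputs, max_new_tokens=512)
#   raw = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
#   parsed = processor.post_process_generation(raw, task='<OCR>', image_size=image.size)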
331
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
332
+ def batch_decode(self, *args, **kwargs):
333
+ """
334
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
335
+ refer to the docstring of this method for more information.
336
+ """
337
+ return self.tokenizer.batch_decode(*args, **kwargs)
338
+
339
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
340
+ def decode(self, *args, **kwargs):
341
+ """
342
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
343
+ the docstring of this method for more information.
344
+ """
345
+ return self.tokenizer.decode(*args, **kwargs)
346
+
347
+ @property
348
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
349
+ def model_input_names(self):
350
+ tokenizer_input_names = self.tokenizer.model_input_names
351
+ image_processor_input_names = self.image_processor.model_input_names
352
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
353
+
354
+ def post_process_generation(self, text, task, image_size):
355
+ """
356
+ Post-process the output of the model to each of the task outputs.
357
+
358
+ Args:
359
+ text (`str`): The text to post-process.
360
+ task (`str`): The task to post-process the text for.
361
+ image_size (`Tuple[int, int]`): The size of the image as (width, height).
362
+ """
363
+
364
+ task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
365
+ task_answer = self.post_processor(
366
+ text=text,
367
+ image_size=image_size,
368
+ parse_tasks=task_answer_post_processing_type,
369
+ )[task_answer_post_processing_type]
370
+
371
+ if task_answer_post_processing_type == 'pure_text':
372
+ final_answer = task_answer
373
+ # remove the special tokens
374
+ final_answer = final_answer.replace('<s>', '').replace('</s>', '\n')
375
+ elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
376
+ od_instances = task_answer
377
+ bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
378
+ labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
379
+ final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
380
+ elif task_answer_post_processing_type in ['ocr']:
381
+ bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
382
+ labels = [str(_od_instance['text']) for _od_instance in task_answer]
383
+ final_answer = {'quad_boxes': bboxes, 'labels': labels}
384
+ elif task_answer_post_processing_type in ['phrase_grounding']:
385
+ bboxes = []
386
+ labels = []
387
+ for _grounded_phrase in task_answer:
388
+ for _bbox in _grounded_phrase['bbox']:
389
+ bboxes.append(_bbox)
390
+ labels.append(_grounded_phrase['cat_name'])
391
+ final_answer = {'bboxes': bboxes, 'labels': labels}
392
+ elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
393
+ labels = []
394
+ polygons = []
395
+ for result in task_answer:
396
+ label = result['cat_name']
397
+ _polygons = result['polygons']
398
+ labels.append(label)
399
+ polygons.append(_polygons)
400
+ final_answer = {'polygons': polygons, 'labels': labels}
401
+ elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
402
+ bboxes = []
403
+ bboxes_labels = []
404
+ polygons = []
405
+ polygons_labels = []
406
+ for result in task_answer:
407
+ label = result['cat_name']
408
+ if 'polygons' in result:
409
+ _polygons = result['polygons']
410
+ polygons.append(_polygons)
411
+ polygons_labels.append(label)
412
+ else:
413
+ _bbox = result['bbox']
414
+ bboxes.append(_bbox)
415
+ bboxes_labels.append(label)
416
+ final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
417
+ else:
418
+ raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
419
+
420
+ final_answer = {
421
+ task: final_answer}
422
+ return final_answer
423
+
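Continuing that sketch, the raw generation is routed through `post_process_generation` above; the task string and image size reuse names from the previous snippet and are illustrative only:

parsed = processor.post_process_generation(
    generated_text,
    task="<OD>",                              # the task prompt used when encoding
    image_size=(image.width, image.height),   # (width, height), matching the quantizers below
)
# For an od-style task this returns {"<OD>": {"bboxes": [[x1, y1, x2, y2], ...], "labels": [...]}}
print(parsed)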
424
+ class BoxQuantizer(object):
425
+ def __init__(self, mode, bins):
426
+ self.mode = mode
427
+ self.bins = bins
428
+
429
+ def quantize(self, boxes: torch.Tensor, size):
430
+ bins_w, bins_h = self.bins # Quantization bins.
431
+ size_w, size_h = size # Original image size.
432
+ size_per_bin_w = size_w / bins_w
433
+ size_per_bin_h = size_h / bins_h
434
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
435
+
436
+ if self.mode == 'floor':
437
+ quantized_xmin = (
438
+ xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
439
+ quantized_ymin = (
440
+ ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
441
+ quantized_xmax = (
442
+ xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
443
+ quantized_ymax = (
444
+ ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
445
+
446
+ elif self.mode == 'round':
447
+ raise NotImplementedError()
448
+
449
+ else:
450
+ raise ValueError('Incorrect quantization type.')
451
+
452
+ quantized_boxes = torch.cat(
453
+ (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
454
+ ).int()
455
+
456
+ return quantized_boxes
457
+
458
+ def dequantize(self, boxes: torch.Tensor, size):
459
+ bins_w, bins_h = self.bins # Quantization bins.
460
+ size_w, size_h = size # Original image size.
461
+ size_per_bin_w = size_w / bins_w
462
+ size_per_bin_h = size_h / bins_h
463
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
464
+
465
+ if self.mode == 'floor':
466
+ # Add 0.5 to use the center position of the bin as the coordinate.
467
+ dequantized_xmin = (xmin + 0.5) * size_per_bin_w
468
+ dequantized_ymin = (ymin + 0.5) * size_per_bin_h
469
+ dequantized_xmax = (xmax + 0.5) * size_per_bin_w
470
+ dequantized_ymax = (ymax + 0.5) * size_per_bin_h
471
+
472
+ elif self.mode == 'round':
473
+ raise NotImplementedError()
474
+
475
+ else:
476
+ raise ValueError('Incorrect quantization type.')
477
+
478
+ dequantized_boxes = torch.cat(
479
+ (dequantized_xmin, dequantized_ymin,
480
+ dequantized_xmax, dequantized_ymax), dim=-1
481
+ )
482
+
483
+ return dequantized_boxes
484
+
485
+
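A self-contained round trip through `BoxQuantizer` in 'floor' mode, assuming the class above is in scope; it shows a pixel box mapping into 1000x1000 location bins and back to bin-center pixel coordinates:

import torch

quantizer = BoxQuantizer(mode="floor", bins=(1000, 1000))
boxes = torch.tensor([[33.0, 160.0, 596.0, 371.0]])  # (xmin, ymin, xmax, ymax) in pixels
size = (640, 480)                                    # (width, height)

bins = quantizer.quantize(boxes, size)   # -> [[51, 333, 931, 772]]
back = quantizer.dequantize(bins, size)  # -> approximately [[32.96, 160.08, 596.16, 370.80]]

`CoordinatesQuantizer` below applies the same binning to (N, 2) point lists, as used for OCR quad boxes and polygons.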
486
+ class CoordinatesQuantizer(object):
487
+ """
488
+ Quantize coordinates (Nx2)
489
+ """
490
+
491
+ def __init__(self, mode, bins):
492
+ self.mode = mode
493
+ self.bins = bins
494
+
495
+ def quantize(self, coordinates: torch.Tensor, size):
496
+ bins_w, bins_h = self.bins # Quantization bins.
497
+ size_w, size_h = size # Original image size.
498
+ size_per_bin_w = size_w / bins_w
499
+ size_per_bin_h = size_h / bins_h
500
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
501
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
502
+
503
+ if self.mode == 'floor':
504
+ quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
505
+ quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
506
+
507
+ elif self.mode == 'round':
508
+ raise NotImplementedError()
509
+
510
+ else:
511
+ raise ValueError('Incorrect quantization type.')
512
+
513
+ quantized_coordinates = torch.cat(
514
+ (quantized_x, quantized_y), dim=-1
515
+ ).int()
516
+
517
+ return quantized_coordinates
518
+
519
+ def dequantize(self, coordinates: torch.Tensor, size):
520
+ bins_w, bins_h = self.bins # Quantization bins.
521
+ size_w, size_h = size # Original image size.
522
+ size_per_bin_w = size_w / bins_w
523
+ size_per_bin_h = size_h / bins_h
524
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
525
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
526
+
527
+ if self.mode == 'floor':
528
+ # Add 0.5 to use the center position of the bin as the coordinate.
529
+ dequantized_x = (x + 0.5) * size_per_bin_w
530
+ dequantized_y = (y + 0.5) * size_per_bin_h
531
+
532
+ elif self.mode == 'round':
533
+ raise NotImplementedError()
534
+
535
+ else:
536
+ raise ValueError('Incorrect quantization type.')
537
+
538
+ dequantized_coordinates = torch.cat(
539
+ (dequantized_x, dequantized_y), dim=-1
540
+ )
541
+
542
+ return dequantized_coordinates
543
+
544
+
545
+ class Florence2PostProcesser(object):
546
+ """
547
+ Florence-2 post process for converting text prediction to various tasks results.
548
+
549
+ Args:
550
+ config: A dict of configs.
551
+ tokenizer: A tokenizer for decoding text to spans.
552
+ sample config:
553
+ UNIFIED_POST_PROCESS:
554
+ # common configs
555
+ NUM_BBOX_HEIGHT_BINS: 1000
556
+ NUM_BBOX_WIDTH_BINS: 1000
557
+ COORDINATES_HEIGHT_BINS: 1000
558
+ COORDINATES_WIDTH_BINS: 1000
559
+ # task specific configs, override the common configs
560
+ PARSE_TASKS:
561
+ - TASK_NAME: 'video_dense_caption'
562
+ PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
563
+ SCORE_MODE: 'avg_cat_name_scores'
564
+ NUM_BINS: 100
565
+ - TASK_NAME: 'od'
566
+ PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
567
+ SCORE_MODE: 'avg_cat_name_scores'
568
+
569
+ Returns:
570
+ parsed_dict (dict): A dict of parsed results.
571
+ """
572
+ def __init__(
573
+ self,
574
+ tokenizer=None
575
+ ):
576
+ parse_tasks = []
577
+ parse_task_configs = {}
578
+ config = self._create_default_config()
579
+ for task in config['PARSE_TASKS']:
580
+ parse_tasks.append(task['TASK_NAME'])
581
+ parse_task_configs[task['TASK_NAME']] = task
582
+
583
+ self.config = config
584
+ self.parse_tasks = parse_tasks
585
+ self.parse_tasks_configs = parse_task_configs
586
+
587
+ self.tokenizer = tokenizer
588
+ if self.tokenizer is not None:
589
+ self.all_special_tokens = set(self.tokenizer.all_special_tokens)
590
+
591
+ self.init_quantizers()
592
+ self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
593
+
594
+ def _create_black_list_of_phrase_grounding(self):
595
+ black_list = {}
596
+
597
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
598
+ black_list = set(
599
+ ['it', 'I', 'me', 'mine',
600
+ 'you', 'your', 'yours',
601
+ 'he', 'him', 'his',
602
+ 'she', 'her', 'hers',
603
+ 'they', 'them', 'their', 'theirs',
604
+ 'one', 'oneself',
605
+ 'we', 'us', 'our', 'ours',
606
+ 'you', 'your', 'yours',
607
+ 'they', 'them', 'their', 'theirs',
608
+ 'mine', 'yours', 'his', 'hers', 'its',
609
+ 'ours', 'yours', 'theirs',
610
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
611
+ 'ourselves', 'yourselves', 'themselves',
612
+ 'this', 'that',
613
+ 'these', 'those',
614
+ 'who', 'whom', 'whose', 'which', 'what',
615
+ 'who', 'whom', 'whose', 'which', 'that',
616
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
617
+ 'each', 'everybody', 'everyone', 'everything',
618
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
619
+ 'some', 'somebody', 'someone', 'something',
620
+ 'each other', 'one another',
621
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
622
+ 'ourselves', 'yourselves', 'themselves',
623
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
624
+ 'other objects', 'lots', 'a set',
625
+ ]
626
+ )
627
+
628
+ return black_list
629
+
630
+ def _create_default_config(self):
631
+ config = {
632
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
633
+ 'NUM_BBOX_WIDTH_BINS': 1000,
634
+ 'BOX_QUANTIZATION_MODE': 'floor',
635
+ 'COORDINATES_HEIGHT_BINS': 1000,
636
+ 'COORDINATES_WIDTH_BINS': 1000,
637
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
638
+ 'PARSE_TASKS': [
639
+ {
640
+ 'TASK_NAME': 'od',
641
+ 'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
642
+ },
643
+ {
644
+ 'TASK_NAME': 'ocr',
645
+ 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
646
+ 'AREA_THRESHOLD': 0.01
647
+ },
648
+ {
649
+ 'TASK_NAME': 'phrase_grounding',
650
+ 'FILTER_BY_BLACK_LIST': True
651
+ },
652
+ {
653
+ 'TASK_NAME': 'pure_text',
654
+ },
655
+ {
656
+ 'TASK_NAME': 'description_with_bboxes',
657
+ },
658
+ {
659
+ 'TASK_NAME': 'description_with_polygons',
660
+ },
661
+ {
662
+ 'TASK_NAME': 'polygons',
663
+ },
664
+ {
665
+ 'TASK_NAME': 'bboxes',
666
+ },
667
+ {
668
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
669
+ }
670
+ ]
671
+ }
672
+
673
+ return config
674
+
675
+ def init_quantizers(self):
676
+ # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
677
+ num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
678
+ num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
679
+ box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
680
+ self.box_quantizer = BoxQuantizer(
681
+ box_quantization_mode,
682
+ (num_bbox_width_bins, num_bbox_height_bins),
683
+ )
684
+
685
+ num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
686
+ num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
687
+ box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
688
+ self.coordinates_quantizer = CoordinatesQuantizer(
689
+ box_quantization_mode,
690
+ (num_bbox_width_bins, num_bbox_height_bins),
691
+ )
692
+
693
+ def decode_with_spans(self, tokenizer, token_ids):
694
+ filtered_tokens = tokenizer.convert_ids_to_tokens(
695
+ token_ids, skip_special_tokens=False)
696
+ assert len(filtered_tokens) == len(token_ids)
697
+
698
+ # To avoid mixing byte-level and unicode for byte-level BPE
699
+ # we need to build string separately for added tokens and byte-level tokens
700
+ # cf. https://github.com/huggingface/transformers/issues/1133
701
+ sub_texts = []
702
+ for token in filtered_tokens:
703
+ if token in self.all_special_tokens:
704
+ sub_texts.append(token)
705
+ else:
706
+ if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
707
+ sub_text = tokenizer.convert_tokens_to_string([token])
708
+ elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
709
+ # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
710
+ # Note: Do not strip sub_text as it may have functional whitespace
711
+ sub_text = token.replace('▁', ' ')
712
+ else:
713
+ raise ValueError(f'type {type(tokenizer)} not supported')
714
+ sub_texts.append(sub_text)
715
+
716
+ text = ''
717
+ spans = []
718
+ for sub_text in sub_texts:
719
+ span = (len(text), len(text) + len(sub_text)) # [start index, end index).
720
+ text += sub_text
721
+ spans.append(span)
722
+
723
+ # Text format:
724
+ # 1. T5Tokenizer/T5TokenizerFast:
725
+ # "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
726
+ # Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
727
+ # 2. BartTokenizer (need to double check):
728
+ # "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
729
+ # Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
730
+ return text, spans
731
+
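The span bookkeeping in `decode_with_spans` is plain string accumulation; this stand-alone sketch uses hand-written sub-token strings (a real call gets them from the tokenizer) to show the [start, end) spans it builds:

sub_texts = ["<loc_52>", "<loc_333>", " dog", "</s>"]
text, spans = "", []
for sub_text in sub_texts:
    spans.append((len(text), len(text) + len(sub_text)))  # [start index, end index) into `text`
    text += sub_text
# text  == "<loc_52><loc_333> dog</s>"
# spans == [(0, 8), (8, 17), (17, 21), (21, 25)]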
732
+ def parse_od_from_text_and_spans(
733
+ self,
734
+ text,
735
+ pattern,
736
+ image_size,
737
+ phrase_centric=False
738
+ ):
739
+ parsed = list(re.finditer(pattern, text))
740
+
741
+ instances = []
742
+ for i in range(len(parsed)):
743
+ # Prepare instance.
744
+ instance = {}
745
+
746
+ if phrase_centric:
747
+ bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
748
+ else:
749
+ bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
750
+ instance['bbox'] = self.box_quantizer.dequantize(
751
+ boxes=torch.tensor(bbox_bins),
752
+ size=image_size
753
+ ).tolist()
754
+
755
+ if phrase_centric:
756
+ instance['cat_name'] = parsed[i].group(1).lower().strip()
757
+ else:
758
+ instance['cat_name'] = parsed[i].group(5).lower().strip()
759
+ instances.append(instance)
760
+
761
+ return instances
762
+
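The od-style regex parsing can be checked in isolation; the sample output string below is made up, and the parse mirrors the `phrase_centric` branch above (label in group 1, location bins in groups 2-5):

import re

pattern = r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
text = "a dog<loc_51><loc_333><loc_931><loc_772>a cat<loc_10><loc_20><loc_110><loc_220>"
for match in re.finditer(pattern, text):
    label = match.group(1).strip()
    bbox_bins = [int(match.group(j)) for j in range(2, 6)]
    print(label, bbox_bins)
# a dog [51, 333, 931, 772]
# a cat [10, 20, 110, 220]

Each bin list is then handed to `BoxQuantizer.dequantize` with the original image size to recover pixel coordinates.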
763
+ def parse_ocr_from_text_and_spans(self,
764
+ text,
765
+ pattern,
766
+ image_size,
767
+ area_threshold=-1.0,
768
+ ):
769
+ bboxes = []
770
+ labels = []
771
+ text = text.replace('<s>', '')
772
+ # ocr with regions
773
+ parsed = re.findall(pattern, text)
774
+ instances = []
775
+ image_width, image_height = image_size
776
+
777
+ for ocr_line in parsed:
778
+ ocr_content = ocr_line[0]
779
+ quad_box = ocr_line[1:]
780
+ quad_box = [int(i) for i in quad_box]
781
+ quad_box = self.coordinates_quantizer.dequantize(
782
+ torch.tensor(np.array(quad_box).reshape(-1, 2)),
783
+ size=image_size
784
+ ).reshape(-1).tolist()
785
+
786
+ if area_threshold > 0:
787
+ x_coords = [i for i in quad_box[0::2]]
788
+ y_coords = [i for i in quad_box[1::2]]
789
+
790
+ # apply the Shoelace formula
791
+ area = 0.5 * abs(sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(-1, 3)))
792
+
793
+ if area < (image_width * image_height) * area_threshold:
794
+ continue
795
+
796
+ bboxes.append(quad_box)
797
+ labels.append(ocr_content)
798
+ instances.append({
799
+ 'quad_box': quad_box,
800
+ 'text': ocr_content,
801
+ })
802
+ return instances
803
+
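The `AREA_THRESHOLD` filter above uses the shoelace formula over the four quad-box corners; a quick stand-alone check with made-up coordinates (a 100x50 axis-aligned rectangle):

quad_box = [10.0, 10.0, 110.0, 10.0, 110.0, 60.0, 10.0, 60.0]  # x1, y1, ..., x4, y4
x = quad_box[0::2]
y = quad_box[1::2]
# range(-1, 3) walks all four edges, including the closing edge back to the first corner
area = 0.5 * abs(sum(x[i] * y[i + 1] - x[i + 1] * y[i] for i in range(-1, 3)))
print(area)  # 5000.0; the instance is dropped only if this falls below image_area * AREA_THRESHOLD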
804
+ def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
805
+ # ignore <s> </s> and <pad>
806
+ cur_span = 0
807
+ if text.startswith('<s>'):
808
+ cur_span += 3
809
+
810
+ text = text.replace('<s>', '')
811
+ text = text.replace('</s>', '')
812
+ text = text.replace('<pad>', '')
813
+
814
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
815
+ phrases = re.findall(pattern, text)
816
+
817
+ # pattern should be text pattern and od pattern
818
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
819
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
820
+
821
+ instances = []
822
+ for pharse_text in phrases:
823
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
824
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
825
+
826
+ if phrase_text_strip == '':
827
+ cur_span += len(pharse_text)
828
+ continue
829
+
830
+ # Prepare instance.
831
+ instance = {}
832
+
833
+ # parse phrase, get string
834
+ phrase = re.search(pattern, phrase_text_strip)
835
+ if phrase is None:
836
+ cur_span += len(pharse_text)
837
+ continue
838
+
839
+ # parse bboxes by box_pattern
840
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
841
+ if len(bboxes_parsed) == 0:
842
+ cur_span += len(pharse_text)
843
+ continue
844
+
845
+ phrase = phrase.group()
846
+ # remove leading and trailing spaces
847
+ phrase = phrase.strip()
848
+
849
+ if phrase in self.black_list_of_phrase_grounding:
850
+ cur_span += len(pharse_text)
851
+ continue
852
+
853
+ # a list of list
854
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
855
+ instance['bbox'] = self.box_quantizer.dequantize(
856
+ boxes=torch.tensor(bbox_bins),
857
+ size=image_size
858
+ ).tolist()
859
+
860
+ # exclude non-ascii characters
861
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
862
+ instance['cat_name'] = phrase
863
+
864
+ instances.append(instance)
865
+
866
+ return instances
867
+
868
+ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
869
+ # temporary parse solution, split by '.'
870
+ # ignore <s> </s> and <pad>
871
+
872
+ text = text.replace('<s>', '')
873
+ text = text.replace('</s>', '')
874
+ text = text.replace('<pad>', '')
875
+
876
+ if allow_empty_phrase:
877
+ pattern = rf"(?:(?:<loc_\d+>){{4,}})"
878
+ else:
879
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
880
+ phrases = re.findall(pattern, text)
881
+
882
+ # pattern should be text pattern and od pattern
883
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
884
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
885
+
886
+ instances = []
887
+ for pharse_text in phrases:
888
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
889
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
890
+
891
+ if phrase_text_strip == '' and not allow_empty_phrase:
892
+ continue
893
+
894
+ # parse phrase, get string
895
+ phrase = re.search(pattern, phrase_text_strip)
896
+ if phrase is None:
897
+ continue
898
+
899
+ phrase = phrase.group()
900
+ # remove leading and trailing spaces
901
+ phrase = phrase.strip()
902
+
903
+ # parse bboxes by box_pattern
904
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
905
+ if len(bboxes_parsed) == 0:
906
+ continue
907
+
908
+ # a list of list
909
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
910
+
911
+ bboxes = self.box_quantizer.dequantize(
912
+ boxes=torch.tensor(bbox_bins),
913
+ size=image_size
914
+ ).tolist()
915
+
916
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
917
+ for _bboxes in bboxes:
918
+ # Prepare instance.
919
+ instance = {}
920
+ instance['bbox'] = _bboxes
921
+ # exclude non-ascii characters
922
+ instance['cat_name'] = phrase
923
+ instances.append(instance)
924
+
925
+ return instances
926
+
927
+ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
928
+ allow_empty_phrase=False,
929
+ polygon_sep_token='<sep>',
930
+ polygon_start_token='<poly>',
931
+ polygon_end_token='</poly>',
932
+ with_box_at_start=False,
933
+ ):
934
+
935
+ # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
936
+ # ignore <s> </s> and <pad>
937
+
938
+ text = text.replace('<s>', '')
939
+ text = text.replace('</s>', '')
940
+ text = text.replace('<pad>', '')
941
+
942
+ if allow_empty_phrase:
943
+ pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
944
+ else:
945
+ # [^<]+: This part matches one or more characters that are not the < symbol.
946
+ # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
947
+ #
948
+ pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
949
+ phrases = re.findall(pattern, text)
950
+
951
+ phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
952
+ box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'
953
+
954
+ # one polygons instance is separated by polygon_start_token and polygon_end_token
955
+ polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
956
+
957
+ instances = []
958
+ for phrase_text in phrases:
959
+
960
+ # exclude loc_\d+>
961
+ # need to get span if want to include category score
962
+ phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
963
+
964
+ # phrase = phrase.replace('<poly>', '')
965
+ # phrase = phrase.replace('poly>', '')
966
+
967
+ if phrase_text_strip == '' and not allow_empty_phrase:
968
+ continue
969
+
970
+
971
+ # parse phrase, get string
972
+ phrase = re.search(phrase_string_pattern, phrase_text_strip)
973
+ if phrase is None:
974
+ continue
975
+ phrase = phrase.group()
976
+ # remove leading and trailing spaces
977
+ phrase = phrase.strip()
978
+
979
+ # parse bboxes by box_pattern
980
+
981
+ # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
982
+ if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
983
+ polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
984
+ else:
985
+ polygons_instances_parsed = [phrase_text]
986
+
987
+ for _polygons_instances_parsed in polygons_instances_parsed:
988
+ # Prepare instance.
989
+ instance = {}
990
+
991
+ # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
992
+ if isinstance(_polygons_instances_parsed, str):
993
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
994
+ else:
995
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
996
+ if len(polygons_parsed) == 0:
997
+ continue
998
+
999
+ # a list of list (polygon)
1000
+ bbox = []
1001
+ polygons = []
1002
+ for _polygon_parsed in polygons_parsed:
1003
+ # group 1: whole <loc_\d+>...</loc_\d+>
1004
+ _polygon = _polygon_parsed.group(1)
1005
+ # parse into list of int
1006
+ _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
1007
+ if with_box_at_start and len(bbox) == 0:
1008
+ if len(_polygon) > 4:
1009
+ # the first four bins encode the bbox, the rest the polygon
1010
+ bbox = _polygon[:4]
1011
+ _polygon = _polygon[4:]
1012
+ else:
1013
+ bbox = [0, 0, 0, 0]  # no valid bbox prediction
1014
+ # abandon last element if is not paired
1015
+ if len(_polygon) % 2 == 1:
1016
+ _polygon = _polygon[:-1]
1017
+
1018
+ # reshape into (n, 2)
1019
+ _polygon = self.coordinates_quantizer.dequantize(
1020
+ torch.tensor(np.array(_polygon).reshape(-1, 2)),
1021
+ size=image_size
1022
+ ).reshape(-1).tolist()
1023
+ # reshape back
1024
+ polygons.append(_polygon)
1025
+
1026
+ instance['cat_name'] = phrase
1027
+ instance['polygons'] = polygons
1028
+ if len(bbox) != 0:
1029
+ instance['bbox'] = self.box_quantizer.dequantize(
1030
+ boxes=torch.tensor([bbox]),
1031
+ size=image_size
1032
+ ).tolist()[0]
1033
+
1034
+ instances.append(instance)
1035
+
1036
+ return instances
1037
+
1038
+ def __call__(
1039
+ self,
1040
+ text=None,
1041
+ image_size=None,
1042
+ parse_tasks=None,
1043
+ ):
1044
+ """
1045
+ Args:
1046
+ text: model outputs
1047
+ image_size: (width, height)
1048
+ parse_tasks: a list of tasks to parse; if None, all tasks are parsed.
1049
+
1050
+ """
1051
+ if parse_tasks is not None:
1052
+ if isinstance(parse_tasks, str):
1053
+ parse_tasks = [parse_tasks]
1054
+ for _parse_task in parse_tasks:
1055
+ assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
1056
+
1057
+ # sequence or text should be provided
1058
+ assert text is not None, 'text should be provided'
1059
+
1060
+ parsed_dict = {
1061
+ 'text': text
1062
+ }
1063
+
1064
+ for task in self.parse_tasks:
1065
+ if parse_tasks is not None and task not in parse_tasks:
1066
+ continue
1067
+
1068
+ pattern = self.parse_tasks_configs[task].get('PATTERN', None)
1069
+
1070
+ if task == 'ocr':
1071
+ instances = self.parse_ocr_from_text_and_spans(
1072
+ text,
1073
+ pattern=pattern,
1074
+ image_size=image_size,
1075
+ area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01),
1076
+ )
1077
+ parsed_dict['ocr'] = instances
1078
+ elif task == 'phrase_grounding':
1079
+ instances = self.parse_phrase_grounding_from_text_and_spans(
1080
+ text,
1081
+ pattern=pattern,
1082
+ image_size=image_size,
1083
+ )
1084
+ parsed_dict['phrase_grounding'] = instances
1085
+ elif task == 'pure_text':
1086
+ parsed_dict['pure_text'] = text
1087
+ elif task == 'description_with_bboxes':
1088
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1089
+ text,
1090
+ pattern=pattern,
1091
+ image_size=image_size,
1092
+ )
1093
+ parsed_dict['description_with_bboxes'] = instances
1094
+ elif task == 'description_with_polygons':
1095
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1096
+ text,
1097
+ pattern=pattern,
1098
+ image_size=image_size,
1099
+ )
1100
+ parsed_dict['description_with_polygons'] = instances
1101
+ elif task == 'polygons':
1102
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1103
+ text,
1104
+ pattern=pattern,
1105
+ image_size=image_size,
1106
+ allow_empty_phrase=True,
1107
+ )
1108
+ parsed_dict['polygons'] = instances
1109
+ elif task == 'bboxes':
1110
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1111
+ text,
1112
+ pattern=pattern,
1113
+ image_size=image_size,
1114
+ allow_empty_phrase=True,
1115
+ )
1116
+ parsed_dict['bboxes'] = instances
1117
+ elif task == 'description_with_bboxes_or_polygons':
1118
+ if '<poly>' in text:
1119
+ # only support either polygons or bboxes, not both at the same time
1120
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1121
+ text,
1122
+ pattern=pattern,
1123
+ image_size=image_size,
1124
+ )
1125
+ else:
1126
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1127
+ text,
1128
+ pattern=pattern,
1129
+ image_size=image_size,
1130
+ )
1131
+ parsed_dict['description_with_bboxes_or_polygons'] = instances
1132
+ else:
1133
+ raise ValueError("task {} is not supported".format(task))
1134
+
1135
+ return parsed_dict
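End to end, the post processor can be exercised directly on a raw output string. The text below is fabricated, the tokenizer is omitted (fine here, since `__call__` only touches the regex-based parsers), and the module-level imports of this file (`re`, `numpy`, `torch`) are assumed to be in place:

post_processor = Florence2PostProcesser(tokenizer=None)
fake_output = "a dog<loc_51><loc_333><loc_931><loc_772>a cat<loc_10><loc_20><loc_110><loc_220></s>"
parsed = post_processor(
    text=fake_output,
    image_size=(640, 480),                  # (width, height)
    parse_tasks="description_with_bboxes",
)
print(parsed["description_with_bboxes"])
# approximately:
# [{'bbox': [32.96, 160.08, 596.16, 370.80], 'cat_name': 'a dog'},
#  {'bbox': [6.72, 9.84, 70.72, 105.84], 'cat_name': 'a cat'}]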
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|endoftext|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
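The `chat_template` above is the standard Qwen2-style ChatML template; a hedged rendering sketch with `AutoTokenizer` (the repo id is a placeholder, not taken from this upload):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this-repo")  # placeholder id
messages = [{"role": "user", "content": "Describe the image."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n'
# '<|im_start|>user\nDescribe the image.<|im_end|>\n'
# '<|im_start|>assistant\n'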
vocab.json ADDED
The diff for this file is too large to render. See raw diff