{
  "architectures": [
    "Qwen2VisionTransformerPretrainedModel"
  ],
  "vision_config": {
    "depth": 32,
    "embed_dim": 1280,
    "hidden_act": "quick_gelu",
    "hidden_size": 1536,
    "in_channels": 3,
    "in_chans": 3,
    "initializer_range": 0.02,
    "mlp_ratio": 4,
    "num_heads": 16,
    "patch_size": 14,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "temporal_patch_size": 2,
    "_attn_implementation": "flash_attention_2"
  },
  "model_type": "qwen2_vl",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.0"
}
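
As a minimal sketch of how these values fit together, the snippet below computes the per-patch input dimension and the number of vision tokens produced for a single image. The 448x448 image size is an illustrative assumption, not part of the config; the values in the dict are copied from the vision_config above.

```python
# Values copied from the vision_config above.
cfg = {
    "in_channels": 3,
    "patch_size": 14,
    "temporal_patch_size": 2,
    "spatial_merge_size": 2,
    "embed_dim": 1280,
}

# Each patch embedding consumes temporal_patch_size frames of
# patch_size x patch_size pixels over in_channels channels:
# 3 * 2 * 14 * 14 = 1176 input features, projected to embed_dim = 1280.
patch_dim = cfg["in_channels"] * cfg["temporal_patch_size"] * cfg["patch_size"] ** 2
print(f"patch embedding input dim: {patch_dim}")

# Illustrative example: a single 448x448 image (hypothetical size, not in the config).
h = w = 448
grid_h, grid_w = h // cfg["patch_size"], w // cfg["patch_size"]   # 32 x 32 patches
tokens_before_merge = grid_h * grid_w                             # 1024
# spatial_merge_size = 2 merges each 2x2 block of patch features into one token.
tokens_after_merge = tokens_before_merge // cfg["spatial_merge_size"] ** 2
print(f"vision tokens after spatial merge: {tokens_after_merge}")  # 256
```

The spatial merge step is why the token count handed to the language model is a quarter of the raw patch count: each 2x2 neighborhood of patch embeddings is concatenated and projected down to the model's hidden size (1536 here).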
|
|