---
# Model configuration `vit_roberta_image2text`.
#
# Three groups of settings follow: `encoder`, `decoder`, and `generation`.
# NOTE(review): the nesting below was reconstructed from key order (each
# group header followed by its keys); in particular, `generation` is assumed
# to be a top-level sibling of `encoder`/`decoder` -- confirm against the
# code that loads this file.
name: vit_roberta_image2text
config_type: model

# Encoder settings. The image-related keys (image_size, patch_size,
# num_channels) live here; presumably a ViT-style vision encoder, judging by
# the config name -- verify against the consumer.
encoder:
  name: null
  config_type: model
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072
  hidden_act: gelu
  # Both dropout probabilities are 0.0 for the encoder (the decoder uses 0.1).
  hidden_dropout_prob: 0.0
  attention_probs_dropout_prob: 0.0
  initializer_range: 0.02
  layer_norm_eps: 1.0e-12
  image_size: 224
  patch_size: 16
  num_channels: 3
  qkv_bias: true
  encoder_stride: 16

# Decoder settings. `is_decoder` and `add_cross_attention` are both enabled;
# hidden_size (768) matches the encoder's hidden_size.
decoder:
  name: null
  config_type: model
  is_decoder: true
  add_cross_attention: true
  attention_probs_dropout_prob: 0.1
  bos_token_id: 0
  eos_token_id: 2
  classifier_dropout: null
  gradient_checkpointing: false
  hidden_act: gelu
  hidden_dropout_prob: 0.1
  hidden_size: 768
  initializer_range: 0.02
  intermediate_size: 3072
  layer_norm_eps: 1.0e-12
  max_position_embeddings: 514
  num_attention_heads: 12
  num_hidden_layers: 12
  # NOTE(review): pad_token_id here is 2, the same value as eos_token_id,
  # while `generation.pad_token_id` below is 1. Confirm which one is intended
  # and whether the two should agree.
  pad_token_id: 2
  position_embedding_type: absolute
  type_vocab_size: 1
  use_cache: true
  vocab_size: 42000

# Generation settings. bos_token_id / eos_token_id repeat the decoder's
# values (0 and 2); pad_token_id (1) differs from the decoder's (2).
generation:
  bos_token_id: 0
  decoder_start_token_id: 0
  return_dict_in_generate: true
  early_stopping: true
  eos_token_id: 2
  length_penalty: 2.0
  max_length: 64
  no_repeat_ngram_size: 3
  num_beams: 4
  pad_token_id: 1