{ "_name_or_path": "Salesforce/blip-vqa-base", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "at table", "1": "skateboard", "2": "lg", "3": "6", "4": "crossing", "5": "don't know", "6": "solid", "7": "picnic table", "8": "full", "9": "plain", "10": "window", "11": "8:35", "12": "red and yellow", "13": "girl", "14": "tabby", "15": "blue", "16": "7:45", "17": "down", "18": "unknown", "19": "hawaii", "20": "woods", "21": "little girl", "22": "roof", "23": "black and white", "24": "in car", "25": "clock tower", "26": "gray", "27": "curtains", "28": "ball", "29": "dog", "30": "woman", "31": "soccer ball", "32": "windows", "33": "donut", "34": "screen", "35": "bus", "36": "neon", "37": "monitor", "38": "jeep", "39": "snowboard", "40": "wine tasting", "41": "french", "42": "wedding", "43": "orange", "44": "king", "45": "tired", "46": "canopy", "47": "low", "48": "bikes", "49": "snowboarding", "50": "2000", "51": "skateboarding", "52": "style", "53": "tent", "54": "necklace", "55": "bike rack", "56": "lying down", "57": "clock", "58": "name tag", "59": "hat", "60": "backpack", "61": "on street", "62": "air", "63": "leather", "64": "2010", "65": "can't tell", "66": "bicycle", "67": "lady", "68": "clear", "69": "tan", "70": "skier", "71": "car", "72": "hair", "73": "curtain", "74": "10", "75": "exit", "76": "natural", "77": "camera", "78": "forest", "79": "station", "80": "skiing", "81": "tv", "82": "fence", "83": "smiling", "84": "platform", "85": "happy", "86": "bedroom", "87": "blonde", "88": "double", "89": "train", "90": "nothing", "91": "street", "92": "soccer", "93": "table", "94": "5", "95": "trees", "96": "women", "97": "giraffes", "98": "right", "99": "7", "100": "shelter", "101": "ground", "102": "plate", "103": "laying down", "104": "chopsticks", "105": "red", "106": "many", "107": "shrimp", "108": "not there", "109": "talking", "110": "cloudy", "111": "green", "112": "bicycles", "113": "bricks", "114": "sun", "115": "2013", "116": "brick", "117": "human", "118": "birthday", "119": "snowboarder", "120": "park", "121": "beagle", "122": "yes", "123": "walking", "124": "rack", "125": "purple", "126": "cat", "127": "giraffe", "128": "8", "129": "pink", "130": "plastic", "131": "red and blue", "132": "stripes", "133": "lanyard", "134": "shade", "135": "dirt", "136": "they aren't", "137": "0", "138": "ice cream", "139": "zoo", "140": "wall", "141": "cup", "142": "queen", "143": "cage", "144": "africa", "145": "beige", "146": "white", "147": "snow", "148": "yellow", "149": "white and blue", "150": "calico", "151": "big ben", "152": "wine", "153": "sky", "154": "security", "155": "2", "156": "sidewalk", "157": "stand", "158": "4", "159": "smile", "160": "gray and black", "161": "protection", "162": "3", "163": "watching", "164": "shadow", "165": "shadows", "166": "fashion", "167": "7:35", "168": "crown", "169": "blue and white", "170": "man", "171": "door", "172": "sleeping", "173": "large", "174": "net", "175": "suv", "176": "brown", "177": "not sure", "178": "arrow", "179": "1", "180": "black", "181": "out", "182": "person", "183": "desert", "184": "boy", "185": "tower", "186": "9:35", "187": "chair", "188": "talking on phone", "189": "small", "190": "resting", "191": "church", "192": "outside", "193": "cross", "194": "white and black", "195": "no", "196": "photographer", "197": "on road", "198": "doughnut" }, "image_size": 384, "image_text_hidden_size": 256, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 137, "1": 179, "10": 74, "2": 155, "2000": 50, "2010": 64, "2013": 115, "3": 162, "4": 158, "5": 94, "6": 3, "7": 99, "7:35": 167, "7:45": 16, "8": 128, "8:35": 11, "9:35": 186, "africa": 144, "air": 62, "arrow": 178, "at table": 0, "backpack": 60, "ball": 28, "beagle": 121, "bedroom": 86, "beige": 145, "bicycle": 66, "bicycles": 112, "big ben": 151, "bike rack": 55, "bikes": 48, "birthday": 118, "black": 180, "black and white": 23, "blonde": 87, "blue": 15, "blue and white": 169, "boy": 184, "brick": 116, "bricks": 113, "brown": 176, "bus": 35, "cage": 143, "calico": 150, "camera": 77, "can't tell": 65, "canopy": 46, "car": 71, "cat": 126, "chair": 187, "chopsticks": 104, "church": 191, "clear": 68, "clock": 57, "clock tower": 25, "cloudy": 110, "cross": 193, "crossing": 4, "crown": 168, "cup": 141, "curtain": 73, "curtains": 27, "desert": 183, "dirt": 135, "dog": 29, "don't know": 5, "donut": 33, "door": 171, "double": 88, "doughnut": 198, "down": 17, "exit": 75, "fashion": 166, "fence": 82, "forest": 78, "french": 41, "full": 8, "giraffe": 127, "giraffes": 97, "girl": 13, "gray": 26, "gray and black": 160, "green": 111, "ground": 101, "hair": 72, "happy": 85, "hat": 59, "hawaii": 19, "human": 117, "ice cream": 138, "in car": 24, "jeep": 38, "king": 44, "lady": 67, "lanyard": 133, "large": 173, "laying down": 103, "leather": 63, "lg": 2, "little girl": 21, "low": 47, "lying down": 56, "man": 170, "many": 106, "monitor": 37, "name tag": 58, "natural": 76, "necklace": 54, "neon": 36, "net": 174, "no": 195, "not sure": 177, "not there": 108, "nothing": 90, "on road": 197, "on street": 61, "orange": 43, "out": 181, "outside": 192, "park": 120, "person": 182, "photographer": 196, "picnic table": 7, "pink": 129, "plain": 9, "plastic": 130, "plate": 102, "platform": 84, "protection": 161, "purple": 125, "queen": 142, "rack": 124, "red": 105, "red and blue": 131, "red and yellow": 12, "resting": 190, "right": 98, "roof": 22, "screen": 34, "security": 154, "shade": 134, "shadow": 164, "shadows": 165, "shelter": 100, "shrimp": 107, "sidewalk": 156, "skateboard": 1, "skateboarding": 51, "skier": 70, "skiing": 80, "sky": 153, "sleeping": 172, "small": 189, "smile": 159, "smiling": 83, "snow": 147, "snowboard": 39, "snowboarder": 119, "snowboarding": 49, "soccer": 92, "soccer ball": 31, "solid": 6, "stand": 157, "station": 79, "street": 91, "stripes": 132, "style": 52, "sun": 114, "suv": 175, "tabby": 14, "table": 93, "talking": 109, "talking on phone": 188, "tan": 69, "tent": 53, "they aren't": 136, "tired": 45, "tower": 185, "train": 89, "trees": 95, "tv": 81, "unknown": 18, "walking": 123, "wall": 140, "watching": 163, "wedding": 42, "white": 146, "white and black": 194, "white and blue": 149, "window": 10, "windows": 32, "wine": 152, "wine tasting": 40, "woman": 30, "women": 96, "woods": 20, "yellow": 148, "yes": 122, "zoo": 139 }, "layer_norm_eps": 1e-12, "logit_scale_init_value": 2.6592, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "projection_dim": 512, "qkv_bias": true, "text_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_probs_dropout_prob": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 30522, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": 2, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "is_decoder": true, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-12, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 512, "min_length": 0, "model_type": "blip_text_model", "no_repeat_ngram_size": 0, "num_attention_heads": 12, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 12, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": 0, "prefix": null, "problem_type": null, "projection_dim": 768, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": 102, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "transformers_version": "4.26.0.dev0", "typical_p": 1.0, "use_bfloat16": false, "use_cache": true, "vocab_size": 30524 }, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.42.4", "type_vocab_size": 2, "vision_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.0, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 768, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 384, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 3072, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-05, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "blip_vision_model", "no_repeat_ngram_size": 0, "num_attention_heads": 12, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 12, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 16, "prefix": null, "problem_type": null, "projection_dim": 512, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "transformers_version": "4.26.0.dev0", "typical_p": 1.0, "use_bfloat16": false }, "vocab_size": 30522 }