{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "2", "1": "green", "2": "black", "3": "double", "4": "woods", "5": "on street", "6": "soccer", "7": "skateboarding", "8": "4", "9": "bikes", "10": "car", "11": "woman", "12": "tan", "13": "person", "14": "out", "15": "can't tell", "16": "orange", "17": "stand", "18": "donut", "19": "ground", "20": "style", "21": "tent", "22": "lady", "23": "5", "24": "bus", "25": "snowboard", "26": "bricks", "27": "6", "28": "7:35", "29": "plate", "30": "air", "31": "cloudy", "32": "skiing", "33": "security", "34": "church", "35": "crown", "36": "women", "37": "bicycle", "38": "white and black", "39": "clear", "40": "9:35", "41": "wedding", "42": "watching", "43": "fashion", "44": "shadow", "45": "talking", "46": "man", "47": "pink", "48": "10", "49": "outside", "50": "blue and white", "51": "rack", "52": "hair", "53": "sun", "54": "wine tasting", "55": "2000", "56": "name tag", "57": "sidewalk", "58": "door", "59": "curtain", "60": "doughnut", "61": "skier", "62": "smile", "63": "desert", "64": "talking on phone", "65": "cage", "66": "shadows", "67": "down", "68": "cross", "69": "yes", "70": "window", "71": "bicycles", "72": "beige", "73": "roof", "74": "don't know", "75": "human", "76": "smiling", "77": "sleeping", "78": "curtains", "79": "dog", "80": "brown", "81": "wine", "82": "birthday", "83": "platform", "84": "cup", "85": "forest", "86": "nothing", "87": "neon", "88": "sky", "89": "little girl", "90": "2010", "91": "crossing", "92": "7", "93": "white", "94": "chair", "95": "windows", "96": "3", "97": "8", "98": "suv", "99": "plastic", "100": "wall", "101": "ice cream", "102": "park", "103": "unknown", "104": "chopsticks", "105": "arrow", "106": "0", "107": "tv", "108": "plain", "109": "calico", "110": "table", "111": "dirt", "112": "africa",
"113": "bike rack", "114": "black and white", "115": "train", "116": "bedroom", "117": "soccer ball", "118": "8:35", "119": "girl", "120": "right", "121": "photographer", "122": "lying down", "123": "no", "124": "white and blue", "125": "gray and black", "126": "hat", "127": "station", "128": "7:45", "129": "leather", "130": "street", "131": "full", "132": "not sure", "133": "they aren't", "134": "red", "135": "camera", "136": "tabby", "137": "ball", "138": "lanyard", "139": "trees", "140": "exit", "141": "jeep", "142": "giraffes", "143": "walking", "144": "king", "145": "french", "146": "picnic table", "147": "zoo", "148": "brick", "149": "big ben", "150": "fence", "151": "snowboarder", "152": "protection", "153": "small", "154": "backpack", "155": "blue", "156": "purple", "157": "shade", "158": "solid", "159": "snowboarding", "160": "1", "161": "resting", "162": "queen", "163": "red and blue", "164": "giraffe", "165": "yellow", "166": "cat", "167": "on road", "168": "low", "169": "in car", "170": "red and yellow", "171": "boy", "172": "hawaii", "173": "snow", "174": "laying down", "175": "stripes", "176": "skateboard", "177": "beagle", "178": "many", "179": "canopy", "180": "at table", "181": "monitor", "182": "clock tower", "183": "net", "184": "clock", "185": "tired", "186": "2013", "187": "screen", "188": "gray", "189": "shrimp", "190": "tower", "191": "large", "192": "necklace", "193": "lg", "194": "happy", "195": "not there", "196": "blonde", "197": "natural", "198": "shelter" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 106, "1": 160, "10": 48, "2": 0, "2000": 55, "2010": 90, "2013": 186, "3": 96, "4": 8, "5": 23, "6": 27, "7": 92, "7:35": 28, "7:45": 128, "8": 97, "8:35": 118, "9:35": 40, "africa": 112, "air": 30, "arrow": 105, "at table": 180, "backpack": 154, "ball": 137, "beagle": 177, "bedroom": 116, "beige": 72, "bicycle": 37, "bicycles": 71, "big ben": 149, "bike rack": 113, "bikes": 9, "birthday": 82, "black": 2,
"black and white": 114, "blonde": 196, "blue": 155, "blue and white": 50, "boy": 171, "brick": 148, "bricks": 26, "brown": 80, "bus": 24, "cage": 65, "calico": 109, "camera": 135, "can't tell": 15, "canopy": 179, "car": 10, "cat": 166, "chair": 94, "chopsticks": 104, "church": 34, "clear": 39, "clock": 184, "clock tower": 182, "cloudy": 31, "cross": 68, "crossing": 91, "crown": 35, "cup": 84, "curtain": 59, "curtains": 78, "desert": 63, "dirt": 111, "dog": 79, "don't know": 74, "donut": 18, "door": 58, "double": 3, "doughnut": 60, "down": 67, "exit": 140, "fashion": 43, "fence": 150, "forest": 85, "french": 145, "full": 131, "giraffe": 164, "giraffes": 142, "girl": 119, "gray": 188, "gray and black": 125, "green": 1, "ground": 19, "hair": 52, "happy": 194, "hat": 126, "hawaii": 172, "human": 75, "ice cream": 101, "in car": 169, "jeep": 141, "king": 144, "lady": 22, "lanyard": 138, "large": 191, "laying down": 174, "leather": 129, "lg": 193, "little girl": 89, "low": 168, "lying down": 122, "man": 46, "many": 178, "monitor": 181, "name tag": 56, "natural": 197, "necklace": 192, "neon": 87, "net": 183, "no": 123, "not sure": 132, "not there": 195, "nothing": 86, "on road": 167, "on street": 5, "orange": 16, "out": 14, "outside": 49, "park": 102, "person": 13, "photographer": 121, "picnic table": 146, "pink": 47, "plain": 108, "plastic": 99, "plate": 29, "platform": 83, "protection": 152, "purple": 156, "queen": 162, "rack": 51, "red": 134, "red and blue": 163, "red and yellow": 170, "resting": 161, "right": 120, "roof": 73, "screen": 187, "security": 33, "shade": 157, "shadow": 44, "shadows": 66, "shelter": 198, "shrimp": 189, "sidewalk": 57, "skateboard": 176, "skateboarding": 7, "skier": 61, "skiing": 32, "sky": 88, "sleeping": 77, "small": 153, "smile": 62, "smiling": 76, "snow": 173, "snowboard": 25, "snowboarder": 151, "snowboarding": 159, "soccer": 6, "soccer ball": 117, "solid": 158, "stand": 17, "station": 127, "street": 130, "stripes": 175, "style": 20, 
"sun": 53, "suv": 98, "tabby": 136, "table": 110, "talking": 45, "talking on phone": 64, "tan": 12, "tent": 21, "they aren't": 133, "tired": 185, "tower": 190, "train": 115, "trees": 139, "tv": 107, "unknown": 103, "walking": 143, "wall": 100, "watching": 42, "wedding": 41, "white": 93, "white and black": 38, "white and blue": 124, "window": 70, "windows": 95, "wine": 81, "wine tasting": 54, "woman": 11, "women": 36, "woods": 4, "yellow": 165, "yes": 69, "zoo": 147 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.33.1", "type_vocab_size": 2, "vocab_size": 30522 }