cccjc committed
Commit f3f40fb · 1 Parent(s): d16a60b

add Grok-2-vision & a huge refactoring to clean up code

This view is limited to 50 files because it contains too many changes. See the raw diff for the rest.
Files changed (50)
1. constants.py +4 -2
2. static/eval_results/Default/Aquila_VL_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
3. static/eval_results/Default/Aria/{summary_results.json → summary_and_keyword_stats.json} +0 -0
4. static/eval_results/Default/Claude_3.5/{summary_results.json → summary_and_keyword_stats.json} +0 -0
5. static/eval_results/Default/Claude_3.5_new/{summary_results.json → summary_and_keyword_stats.json} +0 -0
6. static/eval_results/Default/GPT_4o/{summary_results.json → summary_and_keyword_stats.json} +0 -0
7. static/eval_results/Default/GPT_4o_mini/{summary_results.json → summary_and_keyword_stats.json} +0 -0
8. static/eval_results/Default/Gemini_1.5_flash_002/{summary_results.json → summary_and_keyword_stats.json} +0 -0
9. static/eval_results/Default/Gemini_1.5_pro_002/{summary_results.json → summary_and_keyword_stats.json} +0 -0
10. static/eval_results/Default/Idefics3/{summary_results.json → summary_and_keyword_stats.json} +0 -0
11. static/eval_results/Default/InternVL2_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
12. static/eval_results/Default/InternVL2_5_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
13. static/eval_results/Default/InternVL2_5_78B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
14. static/eval_results/Default/InternVL2_5_8B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
15. static/eval_results/Default/InternVL2_76B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
16. static/eval_results/Default/InternVL2_8B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
17. static/eval_results/Default/Llama_3_2_11B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
18. static/eval_results/Default/Mammoth_VL/{summary_results.json → summary_and_keyword_stats.json} +0 -0
19. static/eval_results/Default/MiniCPM_v2.6/{summary_results.json → summary_and_keyword_stats.json} +0 -0
20. static/eval_results/Default/NVLM/{summary_results.json → summary_and_keyword_stats.json} +0 -0
21. static/eval_results/Default/Phi-3.5-vision/{summary_results.json → summary_and_keyword_stats.json} +0 -0
22. static/eval_results/Default/Pixtral_12B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
23. static/eval_results/Default/Qwen2_VL_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
24. static/eval_results/Default/Qwen2_VL_72B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
25. static/eval_results/Default/Qwen2_VL_7B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
26. static/eval_results/Default/llava_onevision_72B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
27. static/eval_results/Default/llava_onevision_7B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
28. static/eval_results/SI/Aquila_VL_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
29. static/eval_results/SI/Aria/{summary_results.json → summary_and_keyword_stats.json} +0 -0
30. static/eval_results/SI/Claude_3.5/{summary_results.json → summary_and_keyword_stats.json} +0 -0
31. static/eval_results/SI/Claude_3.5_new/{summary_results.json → summary_and_keyword_stats.json} +0 -0
32. static/eval_results/SI/GPT_4o/{summary_results.json → summary_and_keyword_stats.json} +0 -0
33. static/eval_results/SI/GPT_4o_mini/{summary_results.json → summary_and_keyword_stats.json} +0 -0
34. static/eval_results/SI/Gemini_1.5_flash_002/{summary_results.json → summary_and_keyword_stats.json} +0 -0
35. static/eval_results/SI/Gemini_1.5_pro_002/{summary_results.json → summary_and_keyword_stats.json} +0 -0
36. static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json +213 -0
37. static/eval_results/SI/Grok-2-vision-1212/task_results.json +2207 -0
38. static/eval_results/SI/Idefics3/{summary_results.json → summary_and_keyword_stats.json} +0 -0
39. static/eval_results/SI/InternVL2_2B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
40. static/eval_results/SI/InternVL2_76B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
41. static/eval_results/SI/InternVL2_8B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
42. static/eval_results/SI/Llama_3_2_11B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
43. static/eval_results/SI/MiniCPM_v2.6/{summary_results.json → summary_and_keyword_stats.json} +0 -0
44. static/eval_results/SI/Molmo_72B/{summary_results.json → summary_and_keyword_stats.json} +2 -2
45. static/eval_results/SI/Molmo_72B/task_results.json +2 -2
46. static/eval_results/SI/Molmo_7B_D/{summary_results.json → summary_and_keyword_stats.json} +0 -0
47. static/eval_results/SI/NVLM/{summary_results.json → summary_and_keyword_stats.json} +0 -0
48. static/eval_results/SI/POINTS_15_7B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
49. static/eval_results/SI/POINTS_7B/{summary_results.json → summary_and_keyword_stats.json} +0 -0
50. static/eval_results/SI/Phi-3.5-vision/{summary_results.json → summary_and_keyword_stats.json} +0 -0
constants.py CHANGED
@@ -116,6 +116,7 @@ MODEL_NAME_MAP = {
     "InternVL2_5_78B": "InternVL2.5-78B",
     "InternVL2_5_2B": "InternVL2.5-2B",
     "InternVL2_5_8B": "InternVL2.5-8B",
+    "Grok-2-vision-1212": "Grok-2-vision-1212",
 }
 
 DIMENSION_NAME_MAP = {
@@ -203,14 +204,15 @@ MODEL_URLS = {
     "InternVL2_5_78B": "https://huggingface.co/OpenGVLab/InternVL2_5-78B",
     "InternVL2_5_2B": "https://huggingface.co/OpenGVLab/InternVL2_5-2B",
     "InternVL2_5_8B": "https://huggingface.co/OpenGVLab/InternVL2_5-8B",
+    "Grok-2-vision-1212": "https://x.ai/blog/grok-1212",
 }
 
 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
-    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B'],
+    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212'],
     "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"],
-    "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
+    "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212'],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
     "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
     "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
static/eval_results/Default/Aquila_VL_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Aria/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Claude_3.5/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Claude_3.5_new/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/GPT_4o/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/GPT_4o_mini/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Gemini_1.5_flash_002/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Gemini_1.5_pro_002/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Idefics3/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_5_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_5_78B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_5_8B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_76B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/InternVL2_8B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Llama_3_2_11B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Mammoth_VL/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/MiniCPM_v2.6/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/NVLM/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Phi-3.5-vision/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Pixtral_12B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Qwen2_VL_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Qwen2_VL_72B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/Qwen2_VL_7B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/llava_onevision_72B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/Default/llava_onevision_7B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Aquila_VL_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Aria/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Claude_3.5/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Claude_3.5_new/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/GPT_4o/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/GPT_4o_mini/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Gemini_1.5_flash_002/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Gemini_1.5_pro_002/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,213 @@
+ {
+   "model_summary": {
+     "core": {"num_eval_tasks": 273, "num_eval_samples": 4108, "macro_mean_score": 0.4120738315897316},
+     "open": {"num_eval_tasks": 42, "num_eval_samples": 808, "macro_mean_score": 0.5369427320775519},
+     "overall_score": 0.42872301832144094
+   },
+   "keyword_stats": {
+     "skills": {
+       "Object Recognition and Classification": {"count": 172, "num_samples": 2704, "tasks": [], "average_score": 0.466222487838683},
+       "Language Understanding and Generation": {"count": 102, "num_samples": 1707, "tasks": [], "average_score": 0.49260409084481493},
+       "Commonsense and Social Reasoning": {"count": 38, "num_samples": 652, "tasks": [], "average_score": 0.5513856107049714},
+       "Scene and Event Understanding": {"count": 60, "num_samples": 1004, "tasks": [], "average_score": 0.5869208042949662},
+       "Domain-Specific Knowledge and Skills": {"count": 46, "num_samples": 896, "tasks": [], "average_score": 0.4815724520339999},
+       "Ethical and Safety Reasoning": {"count": 10, "num_samples": 170, "tasks": [], "average_score": 0.6636804511278196},
+       "Text Recognition (OCR)": {"count": 101, "num_samples": 1680, "tasks": [], "average_score": 0.3702735127125422},
+       "Spatial and Temporal Reasoning": {"count": 78, "num_samples": 1270, "tasks": [], "average_score": 0.33515724252578744},
+       "Mathematical and Logical Reasoning": {"count": 91, "num_samples": 1628, "tasks": [], "average_score": 0.38305225927176256},
+       "Planning and Decision Making": {"count": 23, "num_samples": 355, "tasks": [], "average_score": 0.10535044936296332}
+     },
+     "input_format": {
+       "Photographs": {"count": 83, "num_samples": 1310, "tasks": [], "average_score": 0.5618064274876843},
+       "Artistic and Creative Content": {"count": 22, "num_samples": 388, "tasks": [], "average_score": 0.5826343022222362},
+       "Diagrams and Data Visualizations": {"count": 88, "num_samples": 1523, "tasks": [], "average_score": 0.46202217635966664},
+       "Text-Based Images and Documents": {"count": 53, "num_samples": 847, "tasks": [], "average_score": 0.30345879283667027},
+       "User Interface Screenshots": {"count": 67, "num_samples": 1117, "tasks": [], "average_score": 0.2751054353728693},
+       "3D Models and Aerial Imagery": {"count": 2, "num_samples": 30, "tasks": [], "average_score": 0.21326546545524588}
+     },
+     "output_format": {
+       "contextual_formatted_text": {"count": 63, "num_samples": 972, "tasks": [], "average_score": 0.36955979847719544},
+       "open_ended_output": {"count": 51, "num_samples": 986, "tasks": [], "average_score": 0.531045897627514},
+       "structured_output": {"count": 72, "num_samples": 1120, "tasks": [], "average_score": 0.39618293480240524},
+       "numerical_data": {"count": 39, "num_samples": 694, "tasks": [], "average_score": 0.4022896767902012},
+       "multiple_choice": {"count": 33, "num_samples": 567, "tasks": [], "average_score": 0.5637194455376273},
+       "exact_text": {"count": 57, "num_samples": 876, "tasks": [], "average_score": 0.3835953032430645}
+     },
+     "input_num": {
+       "1-image": {"count": 315, "num_samples": 5215, "tasks": [], "average_score": 0.4287230183214409}
+     },
+     "app": {
+       "Knowledge": {"count": 77, "num_samples": 1291, "tasks": [], "average_score": 0.5298084871907297},
+       "Perception": {"count": 82, "num_samples": 1318, "tasks": [], "average_score": 0.5357263973810524},
+       "Coding": {"count": 16, "num_samples": 244, "tasks": [], "average_score": 0.4783708274976657},
+       "Science": {"count": 22, "num_samples": 469, "tasks": [], "average_score": 0.4448688427088975},
+       "Information_Extraction": {"count": 41, "num_samples": 639, "tasks": [], "average_score": 0.312597090907984},
+       "Planning": {"count": 44, "num_samples": 712, "tasks": [], "average_score": 0.18803058075452733},
+       "Mathematics": {"count": 30, "num_samples": 497, "tasks": [], "average_score": 0.35624322358581967},
+       "Metrics": {"count": 3, "num_samples": 45, "tasks": [], "average_score": 0.3682539682539683}
+     }
+   }
+ }
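
For orientation, the overall_score above is numerically consistent with a task-count-weighted mean of the core and open macro scores: (273 × 0.41207 + 42 × 0.53694) / 315 ≈ 0.42872. The weighting itself is an inference from these numbers, not something stated in this commit; a small sketch that recomputes it from the added file, assuming the layout shown:

```python
import json

# Sketch: recompute the headline score from the per-split summaries,
# assuming overall_score is the task-count-weighted mean of the two splits.
path = "static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json"
with open(path) as f:
    summary = json.load(f)["model_summary"]

core, open_split = summary["core"], summary["open"]
total_tasks = core["num_eval_tasks"] + open_split["num_eval_tasks"]
weighted = (
    core["num_eval_tasks"] * core["macro_mean_score"]
    + open_split["num_eval_tasks"] * open_split["macro_mean_score"]
) / total_tasks

# Should print two nearly identical values (~0.4287).
print(weighted, summary["overall_score"])
```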
static/eval_results/SI/Grok-2-vision-1212/task_results.json ADDED
@@ -0,0 +1,2207 @@
+ [
+   {"name": "ascii_art_30", "score": 0.14285714285714285, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "humor_explanation", "score": 0.8600000000000001, "eval_type": "llm", "num_demo": 1, "num_query": 15},
+   {"name": "science_figure_explanation", "score": 0.8448275862068965, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "vibe_eval_phrase", "score": 0.6357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "traffic_accident_analysis", "score": 0.39999999999999997, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "figurative_speech_explanation", "score": 0.8517241379310343, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "table2latex_complex", "score": 0.611111111111111, "eval_type": "llm", "num_demo": 1, "num_query": 9},
+   {"name": "unusual_images", "score": 0.8275862068965517, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "art_explanation", "score": 0.7689655172413796, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "ocr_open_ended_qa", "score": 0.8172413793103449, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "bar_chart_interpretation", "score": 0.5310344827586206, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "scibench_w_solution_open_ended", "score": 0.39, "eval_type": "llm", "num_demo": 1, "num_query": 25},
+   {"name": "GUI_Chat_Hard", "score": 0.4538461538461539, "eval_type": "llm", "num_demo": 1, "num_query": 26},
+   {"name": "image_humor_understanding", "score": 0.886206896551724, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "defeasible_reasoning", "score": 0.8655172413793104, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "funny_image_title", "score": 0.6357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "tweets_captioning", "score": 0.6214285714285716, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "graph_interpretation", "score": 0.7896551724137929, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "meme_explain", "score": 0.892857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "guess_image_generation_prompt", "score": 0.7368421052631581, "eval_type": "llm", "num_demo": 1, "num_query": 19},
+   {"name": "visualization_with_code", "score": 0.5142857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "iq_test_open_ended", "score": 0.42758620689655175, "eval_type": "llm", "num_demo": 1, "num_query": 29},
+   {"name": "electrocardiogram", "score": 0.2785714285714285, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "image_captioning_with_additional_requirements", "score": 0.8428571428571432, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "docci_image_description_long", "score": 0.7142857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "GUI_Chat_Easy", "score": 0.5923076923076924, "eval_type": "llm", "num_demo": 1, "num_query": 26},
+   {"name": "bridge_strategies_advanced", "score": 0.20714285714285716, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "bridge_strategies_worldclass", "score": 0.1928571428571429, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "bridge_strategies_expert", "score": 0.3642857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_Spanish", "score": 0.12857142857142856, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_Japanese", "score": 0.09285714285714286, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_French", "score": 0.09999999999999999, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_Arabic", "score": 0.13571428571428573, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_Russian", "score": 0.07142857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "multi_lingual_Ruozhiba_expalnation_English", "score": 0.07142857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "table_understanding_fetaqa", "score": 0.4142857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14},
+   {"name": "red_teaming_celebrity", "score": 0.8250000000000002, "eval_type": "llm", "num_demo": 0, "num_query": 20},
+   {"name": "red_teaming_captcha", "score": 0.12105263157894738, "eval_type": "llm", "num_demo": 1, "num_query": 19},
+   {"name": "red_teaming_jailbreak", "score": 0.575, "eval_type": "llm", "num_demo": 0, "num_query": 20},
+   {"name": "red_teaming_visualmisleading", "score": 0.8789473684210528, "eval_type": "llm", "num_demo": 1, "num_query": 19},
+   {"name": "red_teaming_racial", "score": 0.7250000000000001, "eval_type": "llm", "num_demo": 0, "num_query": 20},
+   {"name": "red_teaming_politics", "score": 0.7150000000000001, "eval_type": "llm", "num_demo": 0, "num_query": 20},
+   {"name": "brand_logo_recognition_and_elaboration", "score": 0.78, "eval_type": "rule", "num_demo": 1, "num_query": 25},
+   {"name": "exchange_rate_estimate_plot", "score": 0.9693285714285712, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "math_parity", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "traffic_future_prediction_from_line_plot", "score": 0.6339473684210527, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "graph_chordless_cycle", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "youtube_video_info_parsing", "score": 0.1547619047619048, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "super_clevr_scene_understanding", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "figureqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "face_keypoint_detection", "score": 0.5273791704902433, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "widerface_face_count_and_event_classification", "score": 0.6071428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "average_humidity_estimate_plot", "score": 0.764, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "weather_info_parsing", "score": 0.5277777777777779, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "egocentric_analysis_single_image", "score": 0.4444444444444444, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "waybill_number_sequence_extraction", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "graph_maxflow", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "TV_show_info_parsing", "score": 0.3412698412698412, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "insect_order_classification", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "electricity_plot_future_prediction", "score": 0.806521052631579, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "chemistry_exams_v", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "finance_table_understanding", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "funsd_document_qa", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "vibe_eval_open", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "question_solution_solving", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "graph_theory", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_analytic", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_length", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "algebra", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "chess_puzzle_single_step", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "chess_winner_identification", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "physical_property_reasoning", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "humor_understand_caption_match", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "coco_object_detection_by_query_property", "score": 0.6660750158426613, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "multilingual_game_info_parsing", "score": 0.26785714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "mnist_pattern", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "dvqa", "score": 0.9473684210526315, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "physics_exams_v", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "snli_ve_visual_entailment", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "3d_indoor_scene_text_bbox_selection", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_descriptive", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "top_rated_hotel_identification", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "science_molecule_chemistry", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "game_info_parsing", "score": 0.5064935064935063, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "deciphering_oracle_bone", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "signboard_identification", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "image_style_recognition", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "math_convexity_value_estimation", "score": 0.5147224146995476, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "3d_indoor_scene_text_bbox_prediction", "score": 0.14081664519620604, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "movie_info_parsing", "score": 0.3125, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "human_relationship_reasoning", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "graph_shortest_path_kamada_kawai", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "coco_person_detection", "score": 0.6271347226619672, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "chart_vqa", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "nlvr2_two_image_compare_qa", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "math_exams_v", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "newspaper_ocr_in_query_box", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "mvsa_sentiment_classification", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "egocentric_spatial_reasoning", "score": 0.4444444444444444, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "graph_isomorphism", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "code_programming_test_easy", "score": 0.2708333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 24},
+   {"name": "biology_exams_v", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "long_string_number_recognition", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "kvqa_knowledge_aware_qa", "score": 0.47368421052631576, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "math_breakpoint", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "landmark_recognition_and_qa", "score": 0.5555555555555555, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "map_diagram_qa", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "pmc_vqa_medical_image_qa", "score": 0.8421052631578947, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "newspaper_page_parse_and_count", "score": 0.5777777777777776, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "science_basic_physics", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "electricity_future_prediction_from_table", "score": 0.568157894736842, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "license_plate_recognition", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "places365_scene_type_classification", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "music_info_parsing", "score": 0.3482142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "multilingual_movie_info_parsing", "score": 0.2040816326530612, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "iconqa_count_and_reasoning", "score": 0.5263157894736842, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "graph_connectivity", "score": 0.5833333333333334, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "graph_shortest_path_planar", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "famous_building_recognition", "score": 0.84375, "eval_type": "rule", "num_demo": 1, "num_query": 16},
+   {"name": "geometry_transformation", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "long_string_letter_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "handwritten_math_expression_extraction", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_solid", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "animal_pose_estimation", "score": 0.29270584066940136, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "single_person_pose_estimation", "score": 0.36770882803106975, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_area", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "hotel_booking_confirmation_parsing", "score": 0.1, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "ili_ratio_future_prediction", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "electricity_load_estimate_plot", "score": 0.6239285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "tqa_textbook_qa", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "stock_info_parsing", "score": 0.21428571428571433, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "quizlet_question_solving", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "stock_price_future_prediction", "score": 0.46871428571428575, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "Ad_count_detection", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "recover_masked_word_in_figure", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "polygon_interior_angles", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "web_action_grounding", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "latex_complex_formula_convertion", "score": 0.35294117647058826, "eval_type": "rule", "num_demo": 1, "num_query": 17},
+   {"name": "transit_map_intersection_points", "score": 0.2261904761904762, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "arxiv_vqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "medical_image_artifacts_indentification", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "song_title_identification_from_lyrics", "score": 0.6071428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "actor_recognition_in_Movie", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "bongard_problem", "score": 0.21052631578947367, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "ascii_art_understanding", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "calendar_schedule_suggestion", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_reasoning_overlapped_circle", "score": 0.4642857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_barman", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "planning_screenshot_floortile", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "medical_blood_vessels_recognition", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "location_vqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "mindmap_elements_parsing", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "mensa_iq_test", "score": 0.3803921568627451, "eval_type": "rule", "num_demo": 1, "num_query": 17},
+   {"name": "flowchart_code_generation", "score": 0.7777777777777778, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "stackoverflow_debug_QA", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "logical_reasoning_find_odd_one_out", "score": 0.03571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "web_action_prediction", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "code_execution", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 16},
+   {"name": "music_sheet_format_QA", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "annoying_word_search", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "interpret_force_perspective_illusion", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "healthcare_info_judgement", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_plot_position_relationship", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "relative_depth_of_different_points", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "topological_sort", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "scibench_fundamental_wo_solution", "score": 0.3673469387755102, "eval_type": "rule", "num_demo": 1, "num_query": 49},
+   {"name": "geometry_reasoning_nested_squares", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "font_recognition", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_reasoning_count_line_intersections", "score": 0.4642857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "circuit_diagram_understanding", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "go_capture_stone", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "monthly_weather_days_count", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "weather_map_climate_type_temperature_parsing", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "top_video_creator_identification", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "rebus", "score": 0.391304347826087, "eval_type": "rule", "num_demo": 1, "num_query": 23},
+   {"name": "ishihara_test", "score": 0.4857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "paper_vqa", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "signage_navigation", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "webpage_code_understanding", "score": 0.7777777777777778, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "medical_counting_lymphocytes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "game_platform_support_identification", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "GUI_Act_Mobile_swipe", "score": 0.519652560877361, "eval_type": "rule", "num_demo": 1, "num_query": 13},
+   {"name": "mahjong", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "scibench_calculus_wo_solution", "score": 0.2653061224489796, "eval_type": "rule", "num_demo": 1, "num_query": 49},
+   {"name": "knowledge_graph_understanding", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "image_translation_en2cn", "score": 0.42609161120798006, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "realworld_qa_en2cn", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_visual_storage", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "GUI_Act_Web_Multi", "score": 0.4711837794298108, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "chinese_idiom_recognition", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "number_comparison", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_blocksworld", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "product_ocr_qa", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "geometry_reasoning_circled_letter", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "GUI_Act_Web_Single", "score": 0.049588137529181904, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "extract_webpage_headline", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_storage", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "soccer_offside", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 9},
+   {"name": "geometry_reasoning_grid", "score": 0.6785714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "relative_reflectance_of_different_regions", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "entertainment_web_game_style", "score": 0.75, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "orchestra_score_recognition", "score": 0.10714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "icon_arithmetic_puzzle", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_grippers", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "MMMU_pro_exam_screenshot", "score": 0.24242424242424243, "eval_type": "rule", "num_demo": 1, "num_query": 99},
+   {"name": "clevrer_physics", "score": 0.55, "eval_type": "rule", "num_demo": 1, "num_query": 20},
+   {"name": "MMMU_physics_chemistry_selected", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_tyreworld", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "music_sheet_note_count", "score": 0.17647058823529413, "eval_type": "rule", "num_demo": 1, "num_query": 17},
+   {"name": "hashtag_recommendation", "score": 0.9226190476190476, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "llavaguard", "score": 0.25, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "medical_multi_organ_segmentation_rater", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "cultural_vqa", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "logical_reasoning_fit_pattern", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "character_recognition_in_TV_shows", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "highest_discount_game_price_identification", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "remaining_playback_time_calculation", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "medical_cell_recognition", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "chess_find_legal_moves", "score": 0.04744105231699729, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "distinguish_ai_generated_image", "score": 0.6842105263157895, "eval_type": "rule", "num_demo": 1, "num_query": 19},
+   {"name": "autonomous_driving_scene_analysis", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "counting_single_image", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "GUI_Act_Mobile_tap", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "road_map_find_highway_between_two_place", "score": 0.5882352941176471, "eval_type": "rule", "num_demo": 1, "num_query": 17},
+   {"name": "chess_sygyzy_endgames", "score": 0.09471861471861472, "eval_type": "rule", "num_demo": 1, "num_query": 14},
+   {"name": "planning_screenshot_termes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15},
+   {"name": "multiple_states_identify_asia",
1586
+ "score": 0.5428571428571428,
1587
+ "eval_type": "rule",
1588
+ "num_demo": 1,
1589
+ "num_query": 14
1590
+ },
1591
+ {
1592
+ "name": "multiple_states_identify_africa",
1593
+ "score": 0.6428571428571429,
1594
+ "eval_type": "rule",
1595
+ "num_demo": 1,
1596
+ "num_query": 14
1597
+ },
1598
+ {
1599
+ "name": "multiple_states_identify_europe",
1600
+ "score": 0.5857142857142857,
1601
+ "eval_type": "rule",
1602
+ "num_demo": 1,
1603
+ "num_query": 14
1604
+ },
1605
+ {
1606
+ "name": "multiple_states_identify_americas",
1607
+ "score": 0.6428571428571429,
1608
+ "eval_type": "rule",
1609
+ "num_demo": 1,
1610
+ "num_query": 14
1611
+ },
1612
+ {
1613
+ "name": "adapted_cvbench_distance",
1614
+ "score": 0.6428571428571429,
1615
+ "eval_type": "rule",
1616
+ "num_demo": 1,
1617
+ "num_query": 14
1618
+ },
1619
+ {
1620
+ "name": "adapted_cvbench_count",
1621
+ "score": 0.42857142857142855,
1622
+ "eval_type": "rule",
1623
+ "num_demo": 1,
1624
+ "num_query": 14
1625
+ },
1626
+ {
1627
+ "name": "adapted_cvbench_depth",
1628
+ "score": 1.0,
1629
+ "eval_type": "rule",
1630
+ "num_demo": 1,
1631
+ "num_query": 14
1632
+ },
1633
+ {
1634
+ "name": "adapted_cvbench_relation",
1635
+ "score": 0.42857142857142855,
1636
+ "eval_type": "rule",
1637
+ "num_demo": 1,
1638
+ "num_query": 14
1639
+ },
1640
+ {
1641
+ "name": "symbolic_graphics_programs_computer_aided_design",
1642
+ "score": 0.35714285714285715,
1643
+ "eval_type": "rule",
1644
+ "num_demo": 1,
1645
+ "num_query": 14
1646
+ },
1647
+ {
1648
+ "name": "symbolic_graphics_programs_scalable_vector_graphics",
1649
+ "score": 0.2777777777777778,
1650
+ "eval_type": "rule",
1651
+ "num_demo": 1,
1652
+ "num_query": 18
1653
+ },
1654
+ {
1655
+ "name": "table_understanding_complex_question_answering",
1656
+ "score": 0.21428571428571427,
1657
+ "eval_type": "rule",
1658
+ "num_demo": 1,
1659
+ "num_query": 14
1660
+ },
1661
+ {
1662
+ "name": "table_understanding_fact_verification",
1663
+ "score": 0.7619047619047618,
1664
+ "eval_type": "rule",
1665
+ "num_demo": 1,
1666
+ "num_query": 14
1667
+ },
1668
+ {
1669
+ "name": "panel_images_multi_question",
1670
+ "score": 0.8095238095238094,
1671
+ "eval_type": "rule",
1672
+ "num_demo": 1,
1673
+ "num_query": 14
1674
+ },
1675
+ {
1676
+ "name": "panel_images_single_question",
1677
+ "score": 0.9285714285714286,
1678
+ "eval_type": "rule",
1679
+ "num_demo": 1,
1680
+ "num_query": 14
1681
+ },
1682
+ {
1683
+ "name": "MMSoc_Misinformation_GossipCop",
1684
+ "score": 0.5714285714285714,
1685
+ "eval_type": "rule",
1686
+ "num_demo": 1,
1687
+ "num_query": 14
1688
+ },
1689
+ {
1690
+ "name": "MMSoc_HatefulMemes",
1691
+ "score": 0.7857142857142857,
1692
+ "eval_type": "rule",
1693
+ "num_demo": 1,
1694
+ "num_query": 14
1695
+ },
1696
+ {
1697
+ "name": "MMSoc_Memotion",
1698
+ "score": 0.6470588235294119,
1699
+ "eval_type": "rule",
1700
+ "num_demo": 1,
1701
+ "num_query": 17
1702
+ },
1703
+ {
1704
+ "name": "MMSoc_Misinformation_PolitiFact",
1705
+ "score": 0.7142857142857143,
1706
+ "eval_type": "rule",
1707
+ "num_demo": 1,
1708
+ "num_query": 14
1709
+ },
1710
+ {
1711
+ "name": "poetry_acrostic_alliteration",
1712
+ "score": 0.0,
1713
+ "eval_type": "rule",
1714
+ "num_demo": 0,
1715
+ "num_query": 15
1716
+ },
1717
+ {
1718
+ "name": "poetry_acrostic",
1719
+ "score": 0.3333333333333333,
1720
+ "eval_type": "rule",
1721
+ "num_demo": 0,
1722
+ "num_query": 15
1723
+ },
1724
+ {
1725
+ "name": "poetry_limerick",
1726
+ "score": 0.7333333333333333,
1727
+ "eval_type": "rule",
1728
+ "num_demo": 0,
1729
+ "num_query": 15
1730
+ },
1731
+ {
1732
+ "name": "poetry_custom_rhyming_scheme",
1733
+ "score": 0.13333333333333333,
1734
+ "eval_type": "rule",
1735
+ "num_demo": 0,
1736
+ "num_query": 15
1737
+ },
1738
+ {
1739
+ "name": "poetry_petrarchian_sonnet_optional_meter",
1740
+ "score": 0.0,
1741
+ "eval_type": "rule",
1742
+ "num_demo": 0,
1743
+ "num_query": 15
1744
+ },
1745
+ {
1746
+ "name": "poetry_haiku",
1747
+ "score": 1.0,
1748
+ "eval_type": "rule",
1749
+ "num_demo": 0,
1750
+ "num_query": 15
1751
+ },
1752
+ {
1753
+ "name": "poetry_shakespearean_sonnet",
1754
+ "score": 0.3333333333333333,
1755
+ "eval_type": "rule",
1756
+ "num_demo": 0,
1757
+ "num_query": 15
1758
+ },
1759
+ {
1760
+ "name": "screenshot_lighteval_math",
1761
+ "score": 0.2,
1762
+ "eval_type": "rule",
1763
+ "num_demo": 1,
1764
+ "num_query": 15
1765
+ },
1766
+ {
1767
+ "name": "screenshot_theoremqa",
1768
+ "score": 0.07142857142857142,
1769
+ "eval_type": "rule",
1770
+ "num_demo": 1,
1771
+ "num_query": 14
1772
+ },
1773
+ {
1774
+ "name": "number_puzzle_sudoku",
1775
+ "score": 0.0,
1776
+ "eval_type": "rule",
1777
+ "num_demo": 1,
1778
+ "num_query": 15
1779
+ },
1780
+ {
1781
+ "name": "number_puzzle_kakuro_5x5",
1782
+ "score": 0.0,
1783
+ "eval_type": "rule",
1784
+ "num_demo": 1,
1785
+ "num_query": 15
1786
+ },
1787
+ {
1788
+ "name": "text_entity_replace",
1789
+ "score": 0.7142857142857143,
1790
+ "eval_type": "rule",
1791
+ "num_demo": 1,
1792
+ "num_query": 14
1793
+ },
1794
+ {
1795
+ "name": "background_change",
1796
+ "score": 0.7857142857142857,
1797
+ "eval_type": "rule",
1798
+ "num_demo": 1,
1799
+ "num_query": 14
1800
+ },
1801
+ {
1802
+ "name": "face_attribute_edit",
1803
+ "score": 0.5,
1804
+ "eval_type": "rule",
1805
+ "num_demo": 1,
1806
+ "num_query": 14
1807
+ },
1808
+ {
1809
+ "name": "face_swap",
1810
+ "score": 0.5714285714285714,
1811
+ "eval_type": "rule",
1812
+ "num_demo": 1,
1813
+ "num_query": 14
1814
+ },
1815
+ {
1816
+ "name": "text_style",
1817
+ "score": 0.6428571428571429,
1818
+ "eval_type": "rule",
1819
+ "num_demo": 1,
1820
+ "num_query": 14
1821
+ },
1822
+ {
1823
+ "name": "out_of_context",
1824
+ "score": 0.9285714285714286,
1825
+ "eval_type": "rule",
1826
+ "num_demo": 1,
1827
+ "num_query": 14
1828
+ },
1829
+ {
1830
+ "name": "clip_stable_diffusion_generate",
1831
+ "score": 0.2857142857142857,
1832
+ "eval_type": "rule",
1833
+ "num_demo": 1,
1834
+ "num_query": 14
1835
+ },
1836
+ {
1837
+ "name": "veracity",
1838
+ "score": 0.8571428571428571,
1839
+ "eval_type": "rule",
1840
+ "num_demo": 1,
1841
+ "num_query": 14
1842
+ },
1843
+ {
1844
+ "name": "counterfactual_arithmetic",
1845
+ "score": 0.6428571428571429,
1846
+ "eval_type": "rule",
1847
+ "num_demo": 1,
1848
+ "num_query": 14
1849
+ },
1850
+ {
1851
+ "name": "maze_2d_8x8",
1852
+ "score": 0.0,
1853
+ "eval_type": "rule",
1854
+ "num_demo": 1,
1855
+ "num_query": 14
1856
+ },
1857
+ {
1858
+ "name": "shape_composition_shapes",
1859
+ "score": 0.40136054421768713,
1860
+ "eval_type": "rule",
1861
+ "num_demo": 1,
1862
+ "num_query": 14
1863
+ },
1864
+ {
1865
+ "name": "shape_composition_colours",
1866
+ "score": 0.4562641723356009,
1867
+ "eval_type": "rule",
1868
+ "num_demo": 1,
1869
+ "num_query": 14
1870
+ },
1871
+ {
1872
+ "name": "autorater_artifact",
1873
+ "score": 0.21428571428571427,
1874
+ "eval_type": "rule",
1875
+ "num_demo": 1,
1876
+ "num_query": 14
1877
+ },
1878
+ {
1879
+ "name": "autorater_artifact_reason",
1880
+ "score": 0.5333333333333333,
1881
+ "eval_type": "rule",
1882
+ "num_demo": 0,
1883
+ "num_query": 15
1884
+ },
1885
+ {
1886
+ "name": "chess_puzzles_crushing",
1887
+ "score": 0.0,
1888
+ "eval_type": "rule",
1889
+ "num_demo": 1,
1890
+ "num_query": 14
1891
+ },
1892
+ {
1893
+ "name": "chess_puzzles_checkmate",
1894
+ "score": 0.0,
1895
+ "eval_type": "rule",
1896
+ "num_demo": 1,
1897
+ "num_query": 14
1898
+ },
1899
+ {
1900
+ "name": "chess_puzzles_equality",
1901
+ "score": 0.0,
1902
+ "eval_type": "rule",
1903
+ "num_demo": 1,
1904
+ "num_query": 15
1905
+ },
1906
+ {
1907
+ "name": "app_layout_understanding_notes",
1908
+ "score": 0.21428571428571427,
1909
+ "eval_type": "rule",
1910
+ "num_demo": 1,
1911
+ "num_query": 14
1912
+ },
1913
+ {
1914
+ "name": "app_layout_understanding_twitter",
1915
+ "score": 0.14285714285714285,
1916
+ "eval_type": "rule",
1917
+ "num_demo": 1,
1918
+ "num_query": 14
1919
+ },
1920
+ {
1921
+ "name": "app_layout_understanding_youtube",
1922
+ "score": 0.14285714285714285,
1923
+ "eval_type": "rule",
1924
+ "num_demo": 1,
1925
+ "num_query": 14
1926
+ },
1927
+ {
1928
+ "name": "app_layout_understanding_tiktok",
1929
+ "score": 0.42857142857142855,
1930
+ "eval_type": "rule",
1931
+ "num_demo": 1,
1932
+ "num_query": 14
1933
+ },
1934
+ {
1935
+ "name": "app_layout_understanding_excel",
1936
+ "score": 0.07142857142857142,
1937
+ "eval_type": "rule",
1938
+ "num_demo": 1,
1939
+ "num_query": 14
1940
+ },
1941
+ {
1942
+ "name": "app_layout_understanding_amazon",
1943
+ "score": 0.07142857142857142,
1944
+ "eval_type": "rule",
1945
+ "num_demo": 1,
1946
+ "num_query": 14
1947
+ },
1948
+ {
1949
+ "name": "app_layout_understanding_instagram",
1950
+ "score": 0.5,
1951
+ "eval_type": "rule",
1952
+ "num_demo": 1,
1953
+ "num_query": 14
1954
+ },
1955
+ {
1956
+ "name": "app_layout_understanding_zoom",
1957
+ "score": 0.3333333333333333,
1958
+ "eval_type": "rule",
1959
+ "num_demo": 1,
1960
+ "num_query": 15
1961
+ },
1962
+ {
1963
+ "name": "app_layout_understanding_word",
1964
+ "score": 0.07142857142857142,
1965
+ "eval_type": "rule",
1966
+ "num_demo": 1,
1967
+ "num_query": 14
1968
+ },
1969
+ {
1970
+ "name": "app_layout_understanding_iphone_settings",
1971
+ "score": 0.6428571428571429,
1972
+ "eval_type": "rule",
1973
+ "num_demo": 1,
1974
+ "num_query": 14
1975
+ },
1976
+ {
1977
+ "name": "app_layout_understanding_leetcode",
1978
+ "score": 0.35714285714285715,
1979
+ "eval_type": "rule",
1980
+ "num_demo": 1,
1981
+ "num_query": 14
1982
+ },
1983
+ {
1984
+ "name": "app_layout_understanding_ppt",
1985
+ "score": 0.14285714285714285,
1986
+ "eval_type": "rule",
1987
+ "num_demo": 1,
1988
+ "num_query": 14
1989
+ },
1990
+ {
1991
+ "name": "app_layout_understanding_alipay",
1992
+ "score": 0.7058823529411765,
1993
+ "eval_type": "rule",
1994
+ "num_demo": 1,
1995
+ "num_query": 17
1996
+ },
1997
+ {
1998
+ "name": "ocr_table_to_markdown",
1999
+ "score": 0.35714285714285715,
2000
+ "eval_type": "rule",
2001
+ "num_demo": 1,
2002
+ "num_query": 14
2003
+ },
2004
+ {
2005
+ "name": "ocr_table_to_latex",
2006
+ "score": 0.6428571428571429,
2007
+ "eval_type": "rule",
2008
+ "num_demo": 1,
2009
+ "num_query": 14
2010
+ },
2011
+ {
2012
+ "name": "ocr_resume_employer_plain",
2013
+ "score": 0.14285714285714285,
2014
+ "eval_type": "rule",
2015
+ "num_demo": 1,
2016
+ "num_query": 14
2017
+ },
2018
+ {
2019
+ "name": "ocr_article_journal",
2020
+ "score": 0.07142857142857142,
2021
+ "eval_type": "rule",
2022
+ "num_demo": 1,
2023
+ "num_query": 14
2024
+ },
2025
+ {
2026
+ "name": "ocr_resume_experience_plain",
2027
+ "score": 0.42857142857142855,
2028
+ "eval_type": "rule",
2029
+ "num_demo": 1,
2030
+ "num_query": 14
2031
+ },
2032
+ {
2033
+ "name": "ocr_math_text_latex",
2034
+ "score": 0.21428571428571427,
2035
+ "eval_type": "rule",
2036
+ "num_demo": 1,
2037
+ "num_query": 14
2038
+ },
2039
+ {
2040
+ "name": "ocr_article_authors",
2041
+ "score": 0.21428571428571427,
2042
+ "eval_type": "rule",
2043
+ "num_demo": 1,
2044
+ "num_query": 14
2045
+ },
2046
+ {
2047
+ "name": "ocr_table_to_csv",
2048
+ "score": 0.5714285714285714,
2049
+ "eval_type": "rule",
2050
+ "num_demo": 1,
2051
+ "num_query": 14
2052
+ },
2053
+ {
2054
+ "name": "ocr_math_equation",
2055
+ "score": 0.35714285714285715,
2056
+ "eval_type": "rule",
2057
+ "num_demo": 1,
2058
+ "num_query": 14
2059
+ },
2060
+ {
2061
+ "name": "ocr_resume_school_plain",
2062
+ "score": 0.2857142857142857,
2063
+ "eval_type": "rule",
2064
+ "num_demo": 1,
2065
+ "num_query": 14
2066
+ },
2067
+ {
2068
+ "name": "ocr_table_to_html",
2069
+ "score": 0.5,
2070
+ "eval_type": "rule",
2071
+ "num_demo": 1,
2072
+ "num_query": 14
2073
+ },
2074
+ {
2075
+ "name": "ocr_resume_skill_plain",
2076
+ "score": 0.21428571428571427,
2077
+ "eval_type": "rule",
2078
+ "num_demo": 1,
2079
+ "num_query": 14
2080
+ },
2081
+ {
2082
+ "name": "crossword_mini_5x5",
2083
+ "score": 0.4285714285714285,
2084
+ "eval_type": "rule",
2085
+ "num_demo": 1,
2086
+ "num_query": 14
2087
+ },
2088
+ {
2089
+ "name": "contain_position_length",
2090
+ "score": 0.6,
2091
+ "eval_type": "rule",
2092
+ "num_demo": 0,
2093
+ "num_query": 15
2094
+ },
2095
+ {
2096
+ "name": "contain_repeat_length",
2097
+ "score": 0.26666666666666666,
2098
+ "eval_type": "rule",
2099
+ "num_demo": 0,
2100
+ "num_query": 15
2101
+ },
2102
+ {
2103
+ "name": "contain_length",
2104
+ "score": 0.8666666666666667,
2105
+ "eval_type": "rule",
2106
+ "num_demo": 0,
2107
+ "num_query": 15
2108
+ },
2109
+ {
2110
+ "name": "contain_contain_length",
2111
+ "score": 1.0,
2112
+ "eval_type": "rule",
2113
+ "num_demo": 0,
2114
+ "num_query": 15
2115
+ },
2116
+ {
2117
+ "name": "pictionary_skribbl_io",
2118
+ "score": 0.2,
2119
+ "eval_type": "rule",
2120
+ "num_demo": 1,
2121
+ "num_query": 20
2122
+ },
2123
+ {
2124
+ "name": "pictionary_doodle_guess",
2125
+ "score": 0.8,
2126
+ "eval_type": "rule",
2127
+ "num_demo": 1,
2128
+ "num_query": 15
2129
+ },
2130
+ {
2131
+ "name": "pictionary_genai_output_chinese",
2132
+ "score": 0.21428571428571427,
2133
+ "eval_type": "rule",
2134
+ "num_demo": 1,
2135
+ "num_query": 14
2136
+ },
2137
+ {
2138
+ "name": "pictionary_cartoon_drawing_guess",
2139
+ "score": 0.8571428571428571,
2140
+ "eval_type": "rule",
2141
+ "num_demo": 1,
2142
+ "num_query": 14
2143
+ },
2144
+ {
2145
+ "name": "pictionary_chinese_food_img2en",
2146
+ "score": 0.5714285714285714,
2147
+ "eval_type": "rule",
2148
+ "num_demo": 1,
2149
+ "num_query": 14
2150
+ },
2151
+ {
2152
+ "name": "reward_models_i2t_reward",
2153
+ "score": 0.35714285714285715,
2154
+ "eval_type": "rule",
2155
+ "num_demo": 1,
2156
+ "num_query": 14
2157
+ },
2158
+ {
2159
+ "name": "memorization_chinese_celebrity",
2160
+ "score": 0.21428571428571427,
2161
+ "eval_type": "rule",
2162
+ "num_demo": 1,
2163
+ "num_query": 14
2164
+ },
2165
+ {
2166
+ "name": "memorization_papers",
2167
+ "score": 0.4666666666666667,
2168
+ "eval_type": "rule",
2169
+ "num_demo": 1,
2170
+ "num_query": 15
2171
+ },
2172
+ {
2173
+ "name": "memorization_famous_treaty",
2174
+ "score": 0.5357142857142857,
2175
+ "eval_type": "rule",
2176
+ "num_demo": 1,
2177
+ "num_query": 14
2178
+ },
2179
+ {
2180
+ "name": "memorization_indian_celebrity",
2181
+ "score": 0.5,
2182
+ "eval_type": "rule",
2183
+ "num_demo": 1,
2184
+ "num_query": 14
2185
+ },
2186
+ {
2187
+ "name": "research_website_parsing_blogpost",
2188
+ "score": 0.0,
2189
+ "eval_type": "rule",
2190
+ "num_demo": 1,
2191
+ "num_query": 14
2192
+ },
2193
+ {
2194
+ "name": "research_website_parsing_publication",
2195
+ "score": 0.14285714285714285,
2196
+ "eval_type": "rule",
2197
+ "num_demo": 1,
2198
+ "num_query": 14
2199
+ },
2200
+ {
2201
+ "name": "research_website_parsing_homepage",
2202
+ "score": 0.21428571428571427,
2203
+ "eval_type": "rule",
2204
+ "num_demo": 1,
2205
+ "num_query": 14
2206
+ }
2207
+ ]
static/eval_results/SI/Idefics3/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/InternVL2_2B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/InternVL2_76B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/InternVL2_8B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Llama_3_2_11B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/MiniCPM_v2.6/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Molmo_72B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
@@ -7,8 +7,8 @@
     "macro_mean_score": 0.36480000609384927,
     "missing_tasks": [
       "planning_screenshot_termes",
-      "table_understanding",
-      "MMSoc_Misinformation_PolitiFact"
+      "MMSoc_Misinformation_PolitiFact",
+      "table_understanding"
     ]
   },
   "open": {
static/eval_results/SI/Molmo_72B/task_results.json CHANGED
@@ -1897,14 +1897,14 @@
       "num_query": 0
     },
     {
-      "name": "table_understanding",
+      "name": "MMSoc_Misinformation_PolitiFact",
       "score": 0.0,
       "eval_type": "rule",
       "num_demo": 0,
       "num_query": 0
     },
     {
-      "name": "MMSoc_Misinformation_PolitiFact",
+      "name": "table_understanding",
       "score": 0.0,
       "eval_type": "rule",
       "num_demo": 0,
static/eval_results/SI/Molmo_7B_D/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/NVLM/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/POINTS_15_7B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/POINTS_7B/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes
static/eval_results/SI/Phi-3.5-vision/{summary_results.json → summary_and_keyword_stats.json} RENAMED
File without changes