cccjc committed
Commit f14a657 · Parent: b552779

add new model results
static/eval_results/all_model_keywords_stats.json CHANGED
@@ -1,4 +1,238 @@
1
  {
2
  "GPT_4o_mini": {
3
  "skills": {
4
  "Object Recognition and Classification": {
@@ -167,19 +401,253 @@
167
  "count": 43,
168
  "num_samples": 698,
169
  "tasks": [],
170
- "average_score": 0.45508480503584553
171
  },
172
  "4-5 images": {
173
  "count": 34,
174
  "num_samples": 520,
175
  "tasks": [],
176
- "average_score": 0.24651576711552803
177
  },
178
  "2-3 images": {
179
  "count": 51,
180
  "num_samples": 802,
181
  "tasks": [],
182
- "average_score": 0.3697506340557095
183
  }
184
  },
185
  "app": {
@@ -187,113 +655,113 @@
187
  "count": 72,
188
  "num_samples": 1124,
189
  "tasks": [],
190
- "average_score": 0.5640948591986592
191
  },
192
  "Planning": {
193
  "count": 78,
194
  "num_samples": 1239,
195
  "tasks": [],
196
- "average_score": 0.2420320329702607
197
  },
198
  "Coding": {
199
  "count": 31,
200
  "num_samples": 474,
201
  "tasks": [],
202
- "average_score": 0.3458483931206892
203
  },
204
  "Perception": {
205
  "count": 145,
206
  "num_samples": 2313,
207
  "tasks": [],
208
- "average_score": 0.43544861040322835
209
  },
210
  "Metrics": {
211
  "count": 20,
212
  "num_samples": 309,
213
  "tasks": [],
214
- "average_score": 0.5176671720617656
215
  },
216
  "Science": {
217
  "count": 29,
218
  "num_samples": 574,
219
  "tasks": [],
220
- "average_score": 0.3554299482098288
221
  },
222
  "Knowledge": {
223
  "count": 97,
224
  "num_samples": 1605,
225
  "tasks": [],
226
- "average_score": 0.5398829253460956
227
  },
228
  "Mathematics": {
229
  "count": 33,
230
  "num_samples": 547,
231
  "tasks": [],
232
- "average_score": 0.32918280841495845
233
  }
234
  }
235
  },
236
- "Llama_3_2_11B": {
237
  "skills": {
238
  "Object Recognition and Classification": {
239
  "count": 303,
240
  "num_samples": 4755,
241
  "tasks": [],
242
- "average_score": 0.1907604552173455
243
  },
244
  "Text Recognition (OCR)": {
245
  "count": 137,
246
  "num_samples": 2239,
247
  "tasks": [],
248
- "average_score": 0.14280015951776653
249
  },
250
  "Language Understanding and Generation": {
251
  "count": 154,
252
  "num_samples": 2509,
253
  "tasks": [],
254
- "average_score": 0.1960311445935766
255
  },
256
  "Scene and Event Understanding": {
257
  "count": 154,
258
  "num_samples": 2467,
259
  "tasks": [],
260
- "average_score": 0.22399113135844315
261
  },
262
  "Mathematical and Logical Reasoning": {
263
  "count": 109,
264
  "num_samples": 1910,
265
  "tasks": [],
266
- "average_score": 0.13303760019716085
267
  },
268
  "Commonsense and Social Reasoning": {
269
  "count": 51,
270
  "num_samples": 855,
271
  "tasks": [],
272
- "average_score": 0.323153603297999
273
  },
274
  "Ethical and Safety Reasoning": {
275
  "count": 15,
276
  "num_samples": 245,
277
  "tasks": [],
278
- "average_score": 0.4260501253132832
279
  },
280
  "Domain-Specific Knowledge and Skills": {
281
  "count": 77,
282
  "num_samples": 1386,
283
  "tasks": [],
284
- "average_score": 0.1770852858056774
285
  },
286
  "Spatial and Temporal Reasoning": {
287
  "count": 152,
288
  "num_samples": 2437,
289
  "tasks": [],
290
- "average_score": 0.15366454315378308
291
  },
292
  "Planning and Decision Making": {
293
  "count": 37,
294
  "num_samples": 577,
295
  "tasks": [],
296
- "average_score": 0.06563884729522687
297
  }
298
  },
299
  "input_format": {
@@ -301,43 +769,43 @@
301
  "count": 93,
302
  "num_samples": 1517,
303
  "tasks": [],
304
- "average_score": 0.11886347847341794
305
  },
306
  "Text-Based Images and Documents": {
307
  "count": 82,
308
  "num_samples": 1294,
309
  "tasks": [],
310
- "average_score": 0.11489351406848371
311
  },
312
  "Diagrams and Data Visualizations": {
313
  "count": 101,
314
  "num_samples": 1718,
315
  "tasks": [],
316
- "average_score": 0.1693681214060816
317
  },
318
  "Videos": {
319
  "count": 43,
320
  "num_samples": 698,
321
  "tasks": [],
322
- "average_score": 0.2123769209846321
323
  },
324
  "Artistic and Creative Content": {
325
  "count": 32,
326
  "num_samples": 541,
327
  "tasks": [],
328
- "average_score": 0.2520175802062012
329
  },
330
  "Photographs": {
331
  "count": 143,
332
  "num_samples": 2248,
333
  "tasks": [],
334
- "average_score": 0.24806929522702081
335
  },
336
  "3D Models and Aerial Imagery": {
337
  "count": 11,
338
  "num_samples": 169,
339
  "tasks": [],
340
- "average_score": 0.06418655520777307
341
  }
342
  },
343
  "output_format": {
@@ -345,37 +813,37 @@
345
  "count": 98,
346
  "num_samples": 1514,
347
  "tasks": [],
348
- "average_score": 0.12349256529641485
349
  },
350
  "structured_output": {
351
  "count": 110,
352
  "num_samples": 1714,
353
  "tasks": [],
354
- "average_score": 0.16374180545556977
355
  },
356
  "exact_text": {
357
  "count": 83,
358
  "num_samples": 1278,
359
  "tasks": [],
360
- "average_score": 0.1576236804437753
361
  },
362
  "numerical_data": {
363
  "count": 49,
364
  "num_samples": 862,
365
  "tasks": [],
366
- "average_score": 0.15014439824913947
367
  },
368
  "open_ended_output": {
369
  "count": 80,
370
  "num_samples": 1454,
371
  "tasks": [],
372
- "average_score": 0.3003142292328822
373
  },
374
  "multiple_choice": {
375
  "count": 85,
376
  "num_samples": 1363,
377
  "tasks": [],
378
- "average_score": 0.19270157739425633
379
  }
380
  },
381
  "input_num": {
@@ -383,37 +851,37 @@
383
  "count": 21,
384
  "num_samples": 314,
385
  "tasks": [],
386
- "average_score": 0.1463246409674981
387
  },
388
  "9-image or more": {
389
  "count": 41,
390
  "num_samples": 623,
391
  "tasks": [],
392
- "average_score": 0.0732004839476103
393
  },
394
  "1-image": {
395
  "count": 315,
396
  "num_samples": 5228,
397
  "tasks": [],
398
- "average_score": 0.19579907898674231
399
  },
400
  "video": {
401
  "count": 43,
402
  "num_samples": 698,
403
  "tasks": [],
404
- "average_score": 0.2123769209846321
405
  },
406
  "4-5 images": {
407
  "count": 34,
408
  "num_samples": 520,
409
  "tasks": [],
410
- "average_score": 0.1351857051327849
411
  },
412
  "2-3 images": {
413
  "count": 51,
414
  "num_samples": 802,
415
  "tasks": [],
416
- "average_score": 0.18586695387250338
417
  }
418
  },
419
  "app": {
@@ -421,49 +889,49 @@
421
  "count": 72,
422
  "num_samples": 1124,
423
  "tasks": [],
424
- "average_score": 0.17288724679416761
425
  },
426
  "Planning": {
427
  "count": 78,
428
  "num_samples": 1239,
429
  "tasks": [],
430
- "average_score": 0.08100042975820579
431
  },
432
  "Coding": {
433
  "count": 31,
434
  "num_samples": 474,
435
  "tasks": [],
436
- "average_score": 0.0575426944971537
437
  },
438
  "Perception": {
439
  "count": 145,
440
  "num_samples": 2313,
441
  "tasks": [],
442
- "average_score": 0.19853488174071646
443
  },
444
  "Metrics": {
445
  "count": 20,
446
  "num_samples": 309,
447
  "tasks": [],
448
- "average_score": 0.254316961351997
449
  },
450
  "Science": {
451
  "count": 29,
452
  "num_samples": 574,
453
  "tasks": [],
454
- "average_score": 0.162801811963855
455
  },
456
  "Knowledge": {
457
  "count": 97,
458
  "num_samples": 1605,
459
  "tasks": [],
460
- "average_score": 0.28055776664538923
461
  },
462
  "Mathematics": {
463
  "count": 33,
464
  "num_samples": 547,
465
  "tasks": [],
466
- "average_score": 0.13937853323074623
467
  }
468
  }
469
  },
@@ -1181,13 +1649,13 @@
1181
  "count": 137,
1182
  "num_samples": 2239,
1183
  "tasks": [],
1184
- "average_score": 0.4989864259016192
1185
  },
1186
  "Language Understanding and Generation": {
1187
  "count": 154,
1188
  "num_samples": 2509,
1189
  "tasks": [],
1190
- "average_score": 0.550842111088751
1191
  },
1192
  "Scene and Event Understanding": {
1193
  "count": 154,
@@ -1267,7 +1735,7 @@
1267
  "count": 143,
1268
  "num_samples": 2248,
1269
  "tasks": [],
1270
- "average_score": 0.5495643443147615
1271
  },
1272
  "3D Models and Aerial Imagery": {
1273
  "count": 11,
@@ -1281,7 +1749,7 @@
1281
  "count": 98,
1282
  "num_samples": 1514,
1283
  "tasks": [],
1284
- "average_score": 0.44828282747008336
1285
  },
1286
  "structured_output": {
1287
  "count": 110,
@@ -1331,7 +1799,7 @@
1331
  "count": 315,
1332
  "num_samples": 5228,
1333
  "tasks": [],
1334
- "average_score": 0.5032283218366624
1335
  },
1336
  "video": {
1337
  "count": 43,
@@ -1375,7 +1843,7 @@
1375
  "count": 145,
1376
  "num_samples": 2313,
1377
  "tasks": [],
1378
- "average_score": 0.524603412718188
1379
  },
1380
  "Metrics": {
1381
  "count": 20,
@@ -1643,25 +2111,25 @@
1643
  "count": 303,
1644
  "num_samples": 4755,
1645
  "tasks": [],
1646
- "average_score": 0.5628292541089482
1647
  },
1648
  "Text Recognition (OCR)": {
1649
  "count": 137,
1650
  "num_samples": 2239,
1651
  "tasks": [],
1652
- "average_score": 0.6173690896799526
1653
  },
1654
  "Language Understanding and Generation": {
1655
  "count": 154,
1656
  "num_samples": 2509,
1657
  "tasks": [],
1658
- "average_score": 0.6122177959113034
1659
  },
1660
  "Scene and Event Understanding": {
1661
  "count": 154,
1662
  "num_samples": 2467,
1663
  "tasks": [],
1664
- "average_score": 0.5822888182775097
1665
  },
1666
  "Mathematical and Logical Reasoning": {
1667
  "count": 109,
@@ -1673,7 +2141,7 @@
1673
  "count": 51,
1674
  "num_samples": 855,
1675
  "tasks": [],
1676
- "average_score": 0.6344814691282928
1677
  },
1678
  "Ethical and Safety Reasoning": {
1679
  "count": 15,
@@ -1705,7 +2173,7 @@
1705
  "count": 93,
1706
  "num_samples": 1517,
1707
  "tasks": [],
1708
- "average_score": 0.6046575685772053
1709
  },
1710
  "Text-Based Images and Documents": {
1711
  "count": 82,
@@ -1735,7 +2203,7 @@
1735
  "count": 143,
1736
  "num_samples": 2248,
1737
  "tasks": [],
1738
- "average_score": 0.559466820210236
1739
  },
1740
  "3D Models and Aerial Imagery": {
1741
  "count": 11,
@@ -1749,13 +2217,13 @@
1749
  "count": 98,
1750
  "num_samples": 1514,
1751
  "tasks": [],
1752
- "average_score": 0.5354190939719853
1753
  },
1754
  "structured_output": {
1755
  "count": 110,
1756
  "num_samples": 1714,
1757
  "tasks": [],
1758
- "average_score": 0.4780999465727382
1759
  },
1760
  "exact_text": {
1761
  "count": 83,
@@ -1793,13 +2261,13 @@
1793
  "count": 41,
1794
  "num_samples": 623,
1795
  "tasks": [],
1796
- "average_score": 0.5265640970967286
1797
  },
1798
  "1-image": {
1799
  "count": 315,
1800
  "num_samples": 5228,
1801
  "tasks": [],
1802
- "average_score": 0.5664191419997976
1803
  },
1804
  "video": {
1805
  "count": 43,
@@ -1817,7 +2285,7 @@
1817
  "count": 51,
1818
  "num_samples": 802,
1819
  "tasks": [],
1820
- "average_score": 0.490800991115688
1821
  }
1822
  },
1823
  "app": {
@@ -1825,7 +2293,7 @@
1825
  "count": 72,
1826
  "num_samples": 1124,
1827
  "tasks": [],
1828
- "average_score": 0.7011776751799048
1829
  },
1830
  "Planning": {
1831
  "count": 78,
@@ -1843,7 +2311,7 @@
1843
  "count": 145,
1844
  "num_samples": 2313,
1845
  "tasks": [],
1846
- "average_score": 0.5491960044393517
1847
  },
1848
  "Metrics": {
1849
  "count": 20,
@@ -1861,7 +2329,7 @@
1861
  "count": 97,
1862
  "num_samples": 1605,
1863
  "tasks": [],
1864
- "average_score": 0.6135384893140922
1865
  },
1866
  "Mathematics": {
1867
  "count": 33,
@@ -2807,6 +3275,240 @@
2807
  }
2808
  }
2809
  },
2810
  "Claude_3.5": {
2811
  "skills": {
2812
  "Object Recognition and Classification": {
@@ -2819,13 +3521,13 @@
2819
  "count": 137,
2820
  "num_samples": 2239,
2821
  "tasks": [],
2822
- "average_score": 0.6026892335040172
2823
  },
2824
  "Language Understanding and Generation": {
2825
  "count": 154,
2826
  "num_samples": 2509,
2827
  "tasks": [],
2828
- "average_score": 0.5695311134746034
2829
  },
2830
  "Scene and Event Understanding": {
2831
  "count": 154,
@@ -2905,7 +3607,7 @@
2905
  "count": 143,
2906
  "num_samples": 2248,
2907
  "tasks": [],
2908
- "average_score": 0.5078124682977725
2909
  },
2910
  "3D Models and Aerial Imagery": {
2911
  "count": 11,
@@ -2919,7 +3621,7 @@
2919
  "count": 98,
2920
  "num_samples": 1514,
2921
  "tasks": [],
2922
- "average_score": 0.5039586533964282
2923
  },
2924
  "structured_output": {
2925
  "count": 110,
@@ -2969,7 +3671,7 @@
2969
  "count": 315,
2970
  "num_samples": 5228,
2971
  "tasks": [],
2972
- "average_score": 0.5364554303845326
2973
  },
2974
  "video": {
2975
  "count": 43,
@@ -3013,7 +3715,7 @@
3013
  "count": 145,
3014
  "num_samples": 2313,
3015
  "tasks": [],
3016
- "average_score": 0.5304907166726288
3017
  },
3018
  "Metrics": {
3019
  "count": 20,
 
1
  {
2
+ "NVLM": {
3
+ "skills": {
4
+ "Object Recognition and Classification": {
5
+ "count": 303,
6
+ "num_samples": 4755,
7
+ "tasks": [],
8
+ "average_score": 0.24033557047857043
9
+ },
10
+ "Text Recognition (OCR)": {
11
+ "count": 137,
12
+ "num_samples": 2239,
13
+ "tasks": [],
14
+ "average_score": 0.32154059695494047
15
+ },
16
+ "Language Understanding and Generation": {
17
+ "count": 154,
18
+ "num_samples": 2509,
19
+ "tasks": [],
20
+ "average_score": 0.2937052996171993
21
+ },
22
+ "Scene and Event Understanding": {
23
+ "count": 154,
24
+ "num_samples": 2467,
25
+ "tasks": [],
26
+ "average_score": 0.22845955700594492
27
+ },
28
+ "Mathematical and Logical Reasoning": {
29
+ "count": 109,
30
+ "num_samples": 1910,
31
+ "tasks": [],
32
+ "average_score": 0.2639741933075709
33
+ },
34
+ "Commonsense and Social Reasoning": {
35
+ "count": 51,
36
+ "num_samples": 855,
37
+ "tasks": [],
38
+ "average_score": 0.40870864071047447
39
+ },
40
+ "Ethical and Safety Reasoning": {
41
+ "count": 15,
42
+ "num_samples": 245,
43
+ "tasks": [],
44
+ "average_score": 0.4555238095238095
45
+ },
46
+ "Domain-Specific Knowledge and Skills": {
47
+ "count": 77,
48
+ "num_samples": 1386,
49
+ "tasks": [],
50
+ "average_score": 0.25785191641267197
51
+ },
52
+ "Spatial and Temporal Reasoning": {
53
+ "count": 152,
54
+ "num_samples": 2437,
55
+ "tasks": [],
56
+ "average_score": 0.15679681195908274
57
+ },
58
+ "Planning and Decision Making": {
59
+ "count": 37,
60
+ "num_samples": 577,
61
+ "tasks": [],
62
+ "average_score": 0.0672259242345112
63
+ }
64
+ },
65
+ "input_format": {
66
+ "User Interface Screenshots": {
67
+ "count": 93,
68
+ "num_samples": 1517,
69
+ "tasks": [],
70
+ "average_score": 0.23922823287047076
71
+ },
72
+ "Text-Based Images and Documents": {
73
+ "count": 82,
74
+ "num_samples": 1294,
75
+ "tasks": [],
76
+ "average_score": 0.21734036617042948
77
+ },
78
+ "Diagrams and Data Visualizations": {
79
+ "count": 101,
80
+ "num_samples": 1718,
81
+ "tasks": [],
82
+ "average_score": 0.30313485498585124
83
+ },
84
+ "Videos": {
85
+ "count": 43,
86
+ "num_samples": 698,
87
+ "tasks": [],
88
+ "average_score": 0.0
89
+ },
90
+ "Artistic and Creative Content": {
91
+ "count": 32,
92
+ "num_samples": 541,
93
+ "tasks": [],
94
+ "average_score": 0.34726189956094355
95
+ },
96
+ "Photographs": {
97
+ "count": 143,
98
+ "num_samples": 2248,
99
+ "tasks": [],
100
+ "average_score": 0.3264757655296162
101
+ },
102
+ "3D Models and Aerial Imagery": {
103
+ "count": 11,
104
+ "num_samples": 169,
105
+ "tasks": [],
106
+ "average_score": 0.056894830390305184
107
+ }
108
+ },
109
+ "output_format": {
110
+ "contextual_formatted_text": {
111
+ "count": 98,
112
+ "num_samples": 1514,
113
+ "tasks": [],
114
+ "average_score": 0.22868389095927066
115
+ },
116
+ "structured_output": {
117
+ "count": 110,
118
+ "num_samples": 1714,
119
+ "tasks": [],
120
+ "average_score": 0.2788963949121424
121
+ },
122
+ "exact_text": {
123
+ "count": 83,
124
+ "num_samples": 1278,
125
+ "tasks": [],
126
+ "average_score": 0.2787764976961992
127
+ },
128
+ "numerical_data": {
129
+ "count": 49,
130
+ "num_samples": 862,
131
+ "tasks": [],
132
+ "average_score": 0.23349712171444964
133
+ },
134
+ "open_ended_output": {
135
+ "count": 80,
136
+ "num_samples": 1454,
137
+ "tasks": [],
138
+ "average_score": 0.3215948035793096
139
+ },
140
+ "multiple_choice": {
141
+ "count": 85,
142
+ "num_samples": 1363,
143
+ "tasks": [],
144
+ "average_score": 0.18487055428231897
145
+ }
146
+ },
147
+ "input_num": {
148
+ "6-8 images": {
149
+ "count": 21,
150
+ "num_samples": 314,
151
+ "tasks": [],
152
+ "average_score": 0.0
153
+ },
154
+ "9-image or more": {
155
+ "count": 41,
156
+ "num_samples": 623,
157
+ "tasks": [],
158
+ "average_score": 0.0
159
+ },
160
+ "1-image": {
161
+ "count": 315,
162
+ "num_samples": 5228,
163
+ "tasks": [],
164
+ "average_score": 0.3680809151131777
165
+ },
166
+ "video": {
167
+ "count": 43,
168
+ "num_samples": 698,
169
+ "tasks": [],
170
+ "average_score": 0.0
171
+ },
172
+ "4-5 images": {
173
+ "count": 34,
174
+ "num_samples": 520,
175
+ "tasks": [],
176
+ "average_score": 0.03838410364145658
177
+ },
178
+ "2-3 images": {
179
+ "count": 51,
180
+ "num_samples": 802,
181
+ "tasks": [],
182
+ "average_score": 0.2325581694709435
183
+ }
184
+ },
185
+ "app": {
186
+ "Information_Extraction": {
187
+ "count": 72,
188
+ "num_samples": 1124,
189
+ "tasks": [],
190
+ "average_score": 0.22773778915303383
191
+ },
192
+ "Planning": {
193
+ "count": 78,
194
+ "num_samples": 1239,
195
+ "tasks": [],
196
+ "average_score": 0.08048160660797504
197
+ },
198
+ "Coding": {
199
+ "count": 31,
200
+ "num_samples": 474,
201
+ "tasks": [],
202
+ "average_score": 0.2390024647851972
203
+ },
204
+ "Perception": {
205
+ "count": 145,
206
+ "num_samples": 2313,
207
+ "tasks": [],
208
+ "average_score": 0.30211261814126533
209
+ },
210
+ "Metrics": {
211
+ "count": 20,
212
+ "num_samples": 309,
213
+ "tasks": [],
214
+ "average_score": 0.18857142857142856
215
+ },
216
+ "Science": {
217
+ "count": 29,
218
+ "num_samples": 574,
219
+ "tasks": [],
220
+ "average_score": 0.24908307640275493
221
+ },
222
+ "Knowledge": {
223
+ "count": 97,
224
+ "num_samples": 1605,
225
+ "tasks": [],
226
+ "average_score": 0.3724877947012685
227
+ },
228
+ "Mathematics": {
229
+ "count": 33,
230
+ "num_samples": 547,
231
+ "tasks": [],
232
+ "average_score": 0.24529601154794037
233
+ }
234
+ }
235
+ },
236
  "GPT_4o_mini": {
237
  "skills": {
238
  "Object Recognition and Classification": {
 
401
  "count": 43,
402
  "num_samples": 698,
403
  "tasks": [],
404
+ "average_score": 0.45508480503584553
405
+ },
406
+ "4-5 images": {
407
+ "count": 34,
408
+ "num_samples": 520,
409
+ "tasks": [],
410
+ "average_score": 0.24651576711552803
411
+ },
412
+ "2-3 images": {
413
+ "count": 51,
414
+ "num_samples": 802,
415
+ "tasks": [],
416
+ "average_score": 0.3697506340557095
417
+ }
418
+ },
419
+ "app": {
420
+ "Information_Extraction": {
421
+ "count": 72,
422
+ "num_samples": 1124,
423
+ "tasks": [],
424
+ "average_score": 0.5640948591986592
425
+ },
426
+ "Planning": {
427
+ "count": 78,
428
+ "num_samples": 1239,
429
+ "tasks": [],
430
+ "average_score": 0.2420320329702607
431
+ },
432
+ "Coding": {
433
+ "count": 31,
434
+ "num_samples": 474,
435
+ "tasks": [],
436
+ "average_score": 0.3458483931206892
437
+ },
438
+ "Perception": {
439
+ "count": 145,
440
+ "num_samples": 2313,
441
+ "tasks": [],
442
+ "average_score": 0.43544861040322835
443
+ },
444
+ "Metrics": {
445
+ "count": 20,
446
+ "num_samples": 309,
447
+ "tasks": [],
448
+ "average_score": 0.5176671720617656
449
+ },
450
+ "Science": {
451
+ "count": 29,
452
+ "num_samples": 574,
453
+ "tasks": [],
454
+ "average_score": 0.3554299482098288
455
+ },
456
+ "Knowledge": {
457
+ "count": 97,
458
+ "num_samples": 1605,
459
+ "tasks": [],
460
+ "average_score": 0.5398829253460956
461
+ },
462
+ "Mathematics": {
463
+ "count": 33,
464
+ "num_samples": 547,
465
+ "tasks": [],
466
+ "average_score": 0.32918280841495845
467
+ }
468
+ }
469
+ },
470
+ "Llama_3_2_11B": {
471
+ "skills": {
472
+ "Object Recognition and Classification": {
473
+ "count": 303,
474
+ "num_samples": 4755,
475
+ "tasks": [],
476
+ "average_score": 0.1907604552173455
477
+ },
478
+ "Text Recognition (OCR)": {
479
+ "count": 137,
480
+ "num_samples": 2239,
481
+ "tasks": [],
482
+ "average_score": 0.14280015951776653
483
+ },
484
+ "Language Understanding and Generation": {
485
+ "count": 154,
486
+ "num_samples": 2509,
487
+ "tasks": [],
488
+ "average_score": 0.1960311445935766
489
+ },
490
+ "Scene and Event Understanding": {
491
+ "count": 154,
492
+ "num_samples": 2467,
493
+ "tasks": [],
494
+ "average_score": 0.22399113135844315
495
+ },
496
+ "Mathematical and Logical Reasoning": {
497
+ "count": 109,
498
+ "num_samples": 1910,
499
+ "tasks": [],
500
+ "average_score": 0.13303760019716085
501
+ },
502
+ "Commonsense and Social Reasoning": {
503
+ "count": 51,
504
+ "num_samples": 855,
505
+ "tasks": [],
506
+ "average_score": 0.323153603297999
507
+ },
508
+ "Ethical and Safety Reasoning": {
509
+ "count": 15,
510
+ "num_samples": 245,
511
+ "tasks": [],
512
+ "average_score": 0.4260501253132832
513
+ },
514
+ "Domain-Specific Knowledge and Skills": {
515
+ "count": 77,
516
+ "num_samples": 1386,
517
+ "tasks": [],
518
+ "average_score": 0.1770852858056774
519
+ },
520
+ "Spatial and Temporal Reasoning": {
521
+ "count": 152,
522
+ "num_samples": 2437,
523
+ "tasks": [],
524
+ "average_score": 0.15366454315378308
525
+ },
526
+ "Planning and Decision Making": {
527
+ "count": 37,
528
+ "num_samples": 577,
529
+ "tasks": [],
530
+ "average_score": 0.06563884729522687
531
+ }
532
+ },
533
+ "input_format": {
534
+ "User Interface Screenshots": {
535
+ "count": 93,
536
+ "num_samples": 1517,
537
+ "tasks": [],
538
+ "average_score": 0.11886347847341794
539
+ },
540
+ "Text-Based Images and Documents": {
541
+ "count": 82,
542
+ "num_samples": 1294,
543
+ "tasks": [],
544
+ "average_score": 0.11489351406848371
545
+ },
546
+ "Diagrams and Data Visualizations": {
547
+ "count": 101,
548
+ "num_samples": 1718,
549
+ "tasks": [],
550
+ "average_score": 0.1693681214060816
551
+ },
552
+ "Videos": {
553
+ "count": 43,
554
+ "num_samples": 698,
555
+ "tasks": [],
556
+ "average_score": 0.2123769209846321
557
+ },
558
+ "Artistic and Creative Content": {
559
+ "count": 32,
560
+ "num_samples": 541,
561
+ "tasks": [],
562
+ "average_score": 0.2520175802062012
563
+ },
564
+ "Photographs": {
565
+ "count": 143,
566
+ "num_samples": 2248,
567
+ "tasks": [],
568
+ "average_score": 0.24806929522702081
569
+ },
570
+ "3D Models and Aerial Imagery": {
571
+ "count": 11,
572
+ "num_samples": 169,
573
+ "tasks": [],
574
+ "average_score": 0.06418655520777307
575
+ }
576
+ },
577
+ "output_format": {
578
+ "contextual_formatted_text": {
579
+ "count": 98,
580
+ "num_samples": 1514,
581
+ "tasks": [],
582
+ "average_score": 0.12349256529641485
583
+ },
584
+ "structured_output": {
585
+ "count": 110,
586
+ "num_samples": 1714,
587
+ "tasks": [],
588
+ "average_score": 0.16374180545556977
589
+ },
590
+ "exact_text": {
591
+ "count": 83,
592
+ "num_samples": 1278,
593
+ "tasks": [],
594
+ "average_score": 0.1576236804437753
595
+ },
596
+ "numerical_data": {
597
+ "count": 49,
598
+ "num_samples": 862,
599
+ "tasks": [],
600
+ "average_score": 0.15014439824913947
601
+ },
602
+ "open_ended_output": {
603
+ "count": 80,
604
+ "num_samples": 1454,
605
+ "tasks": [],
606
+ "average_score": 0.3003142292328822
607
+ },
608
+ "multiple_choice": {
609
+ "count": 85,
610
+ "num_samples": 1363,
611
+ "tasks": [],
612
+ "average_score": 0.19270157739425633
613
+ }
614
+ },
615
+ "input_num": {
616
+ "6-8 images": {
617
+ "count": 21,
618
+ "num_samples": 314,
619
+ "tasks": [],
620
+ "average_score": 0.1463246409674981
621
+ },
622
+ "9-image or more": {
623
+ "count": 41,
624
+ "num_samples": 623,
625
+ "tasks": [],
626
+ "average_score": 0.0732004839476103
627
+ },
628
+ "1-image": {
629
+ "count": 315,
630
+ "num_samples": 5228,
631
+ "tasks": [],
632
+ "average_score": 0.19579907898674231
633
+ },
634
+ "video": {
635
+ "count": 43,
636
+ "num_samples": 698,
637
+ "tasks": [],
638
+ "average_score": 0.2123769209846321
639
  },
640
  "4-5 images": {
641
  "count": 34,
642
  "num_samples": 520,
643
  "tasks": [],
644
+ "average_score": 0.1351857051327849
645
  },
646
  "2-3 images": {
647
  "count": 51,
648
  "num_samples": 802,
649
  "tasks": [],
650
+ "average_score": 0.18586695387250338
651
  }
652
  },
653
  "app": {
 
655
  "count": 72,
656
  "num_samples": 1124,
657
  "tasks": [],
658
+ "average_score": 0.17288724679416761
659
  },
660
  "Planning": {
661
  "count": 78,
662
  "num_samples": 1239,
663
  "tasks": [],
664
+ "average_score": 0.08100042975820579
665
  },
666
  "Coding": {
667
  "count": 31,
668
  "num_samples": 474,
669
  "tasks": [],
670
+ "average_score": 0.0575426944971537
671
  },
672
  "Perception": {
673
  "count": 145,
674
  "num_samples": 2313,
675
  "tasks": [],
676
+ "average_score": 0.19853488174071646
677
  },
678
  "Metrics": {
679
  "count": 20,
680
  "num_samples": 309,
681
  "tasks": [],
682
+ "average_score": 0.254316961351997
683
  },
684
  "Science": {
685
  "count": 29,
686
  "num_samples": 574,
687
  "tasks": [],
688
+ "average_score": 0.162801811963855
689
  },
690
  "Knowledge": {
691
  "count": 97,
692
  "num_samples": 1605,
693
  "tasks": [],
694
+ "average_score": 0.28055776664538923
695
  },
696
  "Mathematics": {
697
  "count": 33,
698
  "num_samples": 547,
699
  "tasks": [],
700
+ "average_score": 0.13937853323074623
701
  }
702
  }
703
  },
704
+ "Claude_3.5_new": {
705
  "skills": {
706
  "Object Recognition and Classification": {
707
  "count": 303,
708
  "num_samples": 4755,
709
  "tasks": [],
710
+ "average_score": 0.5690042283891658
711
  },
712
  "Text Recognition (OCR)": {
713
  "count": 137,
714
  "num_samples": 2239,
715
  "tasks": [],
716
+ "average_score": 0.6220681231036606
717
  },
718
  "Language Understanding and Generation": {
719
  "count": 154,
720
  "num_samples": 2509,
721
  "tasks": [],
722
+ "average_score": 0.6077980666415158
723
  },
724
  "Scene and Event Understanding": {
725
  "count": 154,
726
  "num_samples": 2467,
727
  "tasks": [],
728
+ "average_score": 0.5511434932168607
729
  },
730
  "Mathematical and Logical Reasoning": {
731
  "count": 109,
732
  "num_samples": 1910,
733
  "tasks": [],
734
+ "average_score": 0.4885536652013625
735
  },
736
  "Commonsense and Social Reasoning": {
737
  "count": 51,
738
  "num_samples": 855,
739
  "tasks": [],
740
+ "average_score": 0.590818684469149
741
  },
742
  "Ethical and Safety Reasoning": {
743
  "count": 15,
744
  "num_samples": 245,
745
  "tasks": [],
746
+ "average_score": 0.6569473684210526
747
  },
748
  "Domain-Specific Knowledge and Skills": {
749
  "count": 77,
750
  "num_samples": 1386,
751
  "tasks": [],
752
+ "average_score": 0.5486763511384175
753
  },
754
  "Spatial and Temporal Reasoning": {
755
  "count": 152,
756
  "num_samples": 2437,
757
  "tasks": [],
758
+ "average_score": 0.4315385951907387
759
  },
760
  "Planning and Decision Making": {
761
  "count": 37,
762
  "num_samples": 577,
763
  "tasks": [],
764
+ "average_score": 0.2909419331017877
765
  }
766
  },
767
  "input_format": {
 
769
  "count": 93,
770
  "num_samples": 1517,
771
  "tasks": [],
772
+ "average_score": 0.6048192628845258
773
  },
774
  "Text-Based Images and Documents": {
775
  "count": 82,
776
  "num_samples": 1294,
777
  "tasks": [],
778
+ "average_score": 0.48924295292319175
779
  },
780
  "Diagrams and Data Visualizations": {
781
  "count": 101,
782
  "num_samples": 1718,
783
  "tasks": [],
784
+ "average_score": 0.556418710368288
785
  },
786
  "Videos": {
787
  "count": 43,
788
  "num_samples": 698,
789
  "tasks": [],
790
+ "average_score": 0.4946691340754988
791
  },
792
  "Artistic and Creative Content": {
793
  "count": 32,
794
  "num_samples": 541,
795
  "tasks": [],
796
+ "average_score": 0.5558756390298104
797
  },
798
  "Photographs": {
799
  "count": 143,
800
  "num_samples": 2248,
801
  "tasks": [],
802
+ "average_score": 0.542519242638518
803
  },
804
  "3D Models and Aerial Imagery": {
805
  "count": 11,
806
  "num_samples": 169,
807
  "tasks": [],
808
+ "average_score": 0.44210335381541843
809
  }
810
  },
811
  "output_format": {
 
813
  "count": 98,
814
  "num_samples": 1514,
815
  "tasks": [],
816
+ "average_score": 0.5187252051932875
817
  },
818
  "structured_output": {
819
  "count": 110,
820
  "num_samples": 1714,
821
  "tasks": [],
822
+ "average_score": 0.5071113150600759
823
  },
824
  "exact_text": {
825
  "count": 83,
826
  "num_samples": 1278,
827
  "tasks": [],
828
+ "average_score": 0.5387340524651681
829
  },
830
  "numerical_data": {
831
  "count": 49,
832
  "num_samples": 862,
833
  "tasks": [],
834
+ "average_score": 0.4824302644151348
835
  },
836
  "open_ended_output": {
837
  "count": 80,
838
  "num_samples": 1454,
839
  "tasks": [],
840
+ "average_score": 0.6242798397166945
841
  },
842
  "multiple_choice": {
843
  "count": 85,
844
  "num_samples": 1363,
845
  "tasks": [],
846
+ "average_score": 0.5782691045270721
847
  }
848
  },
849
  "input_num": {
 
851
  "count": 21,
852
  "num_samples": 314,
853
  "tasks": [],
854
+ "average_score": 0.4630277507828528
855
  },
856
  "9-image or more": {
857
  "count": 41,
858
  "num_samples": 623,
859
  "tasks": [],
860
+ "average_score": 0.5914338446093256
861
  },
862
  "1-image": {
863
  "count": 315,
864
  "num_samples": 5228,
865
  "tasks": [],
866
+ "average_score": 0.5636254729390459
867
  },
868
  "video": {
869
  "count": 43,
870
  "num_samples": 698,
871
  "tasks": [],
872
+ "average_score": 0.4946691340754988
873
  },
874
  "4-5 images": {
875
  "count": 34,
876
  "num_samples": 520,
877
  "tasks": [],
878
+ "average_score": 0.4828123870640382
879
  },
880
  "2-3 images": {
881
  "count": 51,
882
  "num_samples": 802,
883
  "tasks": [],
884
+ "average_score": 0.48756464396063437
885
  }
886
  },
887
  "app": {
 
889
  "count": 72,
890
  "num_samples": 1124,
891
  "tasks": [],
892
+ "average_score": 0.6590137441693218
893
  },
894
  "Planning": {
895
  "count": 78,
896
  "num_samples": 1239,
897
  "tasks": [],
898
+ "average_score": 0.39901670035164916
899
  },
900
  "Coding": {
901
  "count": 31,
902
  "num_samples": 474,
903
  "tasks": [],
904
+ "average_score": 0.5166853031535193
905
  },
906
  "Perception": {
907
  "count": 145,
908
  "num_samples": 2313,
909
  "tasks": [],
910
+ "average_score": 0.5561634744977417
911
  },
912
  "Metrics": {
913
  "count": 20,
914
  "num_samples": 309,
915
  "tasks": [],
916
+ "average_score": 0.6123769274172342
917
  },
918
  "Science": {
919
  "count": 29,
920
  "num_samples": 574,
921
  "tasks": [],
922
+ "average_score": 0.5512015158810595
923
  },
924
  "Knowledge": {
925
  "count": 97,
926
  "num_samples": 1605,
927
  "tasks": [],
928
+ "average_score": 0.5657956645626817
929
  },
930
  "Mathematics": {
931
  "count": 33,
932
  "num_samples": 547,
933
  "tasks": [],
934
+ "average_score": 0.4763267502912362
935
  }
936
  }
937
  },
 
1649
  "count": 137,
1650
  "num_samples": 2239,
1651
  "tasks": [],
1652
+ "average_score": 0.49947304390648534
1653
  },
1654
  "Language Understanding and Generation": {
1655
  "count": 154,
1656
  "num_samples": 2509,
1657
  "tasks": [],
1658
+ "average_score": 0.5512750115216515
1659
  },
1660
  "Scene and Event Understanding": {
1661
  "count": 154,
 
1735
  "count": 143,
1736
  "num_samples": 2248,
1737
  "tasks": [],
1738
+ "average_score": 0.5500305447809621
1739
  },
1740
  "3D Models and Aerial Imagery": {
1741
  "count": 11,
 
1749
  "count": 98,
1750
  "num_samples": 1514,
1751
  "tasks": [],
1752
+ "average_score": 0.44896309957892694
1753
  },
1754
  "structured_output": {
1755
  "count": 110,
 
1799
  "count": 315,
1800
  "num_samples": 5228,
1801
  "tasks": [],
1802
+ "average_score": 0.5034399620483027
1803
  },
1804
  "video": {
1805
  "count": 43,
 
1843
  "count": 145,
1844
  "num_samples": 2313,
1845
  "tasks": [],
1846
+ "average_score": 0.5250631828331306
1847
  },
1848
  "Metrics": {
1849
  "count": 20,
 
2111
  "count": 303,
2112
  "num_samples": 4755,
2113
  "tasks": [],
2114
+ "average_score": 0.5630800473549525
2115
  },
2116
  "Text Recognition (OCR)": {
2117
  "count": 137,
2118
  "num_samples": 2239,
2119
  "tasks": [],
2120
+ "average_score": 0.6216411634729735
2121
  },
2122
  "Language Understanding and Generation": {
2123
  "count": 154,
2124
  "num_samples": 2509,
2125
  "tasks": [],
2126
+ "average_score": 0.616018277142757
2127
  },
2128
  "Scene and Event Understanding": {
2129
  "count": 154,
2130
  "num_samples": 2467,
2131
  "tasks": [],
2132
+ "average_score": 0.5823184402392676
2133
  },
2134
  "Mathematical and Logical Reasoning": {
2135
  "count": 109,
 
2141
  "count": 51,
2142
  "num_samples": 855,
2143
  "tasks": [],
2144
+ "average_score": 0.6345709158363462
2145
  },
2146
  "Ethical and Safety Reasoning": {
2147
  "count": 15,
 
2173
  "count": 93,
2174
  "num_samples": 1517,
2175
  "tasks": [],
2176
+ "average_score": 0.608083455060831
2177
  },
2178
  "Text-Based Images and Documents": {
2179
  "count": 82,
 
2203
  "count": 143,
2204
  "num_samples": 2248,
2205
  "tasks": [],
2206
+ "average_score": 0.5613635226492386
2207
  },
2208
  "3D Models and Aerial Imagery": {
2209
  "count": 11,
 
2217
  "count": 98,
2218
  "num_samples": 1514,
2219
  "tasks": [],
2220
+ "average_score": 0.5388690453811203
2221
  },
2222
  "structured_output": {
2223
  "count": 110,
2224
  "num_samples": 1714,
2225
  "tasks": [],
2226
+ "average_score": 0.4803884979696412
2227
  },
2228
  "exact_text": {
2229
  "count": 83,
 
2261
  "count": 41,
2262
  "num_samples": 623,
2263
  "tasks": [],
2264
+ "average_score": 0.5343350103400748
2265
  },
2266
  "1-image": {
2267
  "count": 315,
2268
  "num_samples": 5228,
2269
  "tasks": [],
2270
+ "average_score": 0.5672657028463585
2271
  },
2272
  "video": {
2273
  "count": 43,
 
2285
  "count": 51,
2286
  "num_samples": 802,
2287
  "tasks": [],
2288
+ "average_score": 0.49089043782374137
2289
  }
2290
  },
2291
  "app": {
 
2293
  "count": 72,
2294
  "num_samples": 1124,
2295
  "tasks": [],
2296
+ "average_score": 0.7056027785545881
2297
  },
2298
  "Planning": {
2299
  "count": 78,
 
2311
  "count": 145,
2312
  "num_samples": 2313,
2313
  "tasks": [],
2314
+ "average_score": 0.5510350848991218
2315
  },
2316
  "Metrics": {
2317
  "count": 20,
 
2329
  "count": 97,
2330
  "num_samples": 1605,
2331
  "tasks": [],
2332
+ "average_score": 0.6135855179956459
2333
  },
2334
  "Mathematics": {
2335
  "count": 33,
 
3275
  }
3276
  }
3277
  },
3278
+ "Aria": {
3279
+ "skills": {
3280
+ "Object Recognition and Classification": {
3281
+ "count": 303,
3282
+ "num_samples": 4755,
3283
+ "tasks": [],
3284
+ "average_score": 0.3264829094772722
3285
+ },
3286
+ "Text Recognition (OCR)": {
3287
+ "count": 137,
3288
+ "num_samples": 2239,
3289
+ "tasks": [],
3290
+ "average_score": 0.35712138797286674
3291
+ },
3292
+ "Language Understanding and Generation": {
3293
+ "count": 154,
3294
+ "num_samples": 2509,
3295
+ "tasks": [],
3296
+ "average_score": 0.4004806395853317
3297
+ },
3298
+ "Scene and Event Understanding": {
3299
+ "count": 154,
3300
+ "num_samples": 2467,
3301
+ "tasks": [],
3302
+ "average_score": 0.3783082688258977
3303
+ },
3304
+ "Mathematical and Logical Reasoning": {
3305
+ "count": 109,
3306
+ "num_samples": 1910,
3307
+ "tasks": [],
3308
+ "average_score": 0.27628131703993153
3309
+ },
3310
+ "Commonsense and Social Reasoning": {
3311
+ "count": 51,
3312
+ "num_samples": 855,
3313
+ "tasks": [],
3314
+ "average_score": 0.4942870225393938
3315
+ },
3316
+ "Ethical and Safety Reasoning": {
3317
+ "count": 15,
3318
+ "num_samples": 245,
3319
+ "tasks": [],
3320
+ "average_score": 0.5811228070175439
3321
+ },
3322
+ "Domain-Specific Knowledge and Skills": {
3323
+ "count": 77,
3324
+ "num_samples": 1386,
3325
+ "tasks": [],
3326
+ "average_score": 0.3279996334048362
3327
+ },
3328
+ "Spatial and Temporal Reasoning": {
3329
+ "count": 152,
3330
+ "num_samples": 2437,
3331
+ "tasks": [],
3332
+ "average_score": 0.2481896092177717
3333
+ },
3334
+ "Planning and Decision Making": {
3335
+ "count": 37,
3336
+ "num_samples": 577,
3337
+ "tasks": [],
3338
+ "average_score": 0.11945216302285933
3339
+ }
3340
+ },
3341
+ "input_format": {
3342
+ "User Interface Screenshots": {
3343
+ "count": 93,
3344
+ "num_samples": 1517,
3345
+ "tasks": [],
3346
+ "average_score": 0.2830308005758272
3347
+ },
3348
+ "Text-Based Images and Documents": {
3349
+ "count": 82,
3350
+ "num_samples": 1294,
3351
+ "tasks": [],
3352
+ "average_score": 0.27833423130489043
3353
+ },
3354
+ "Diagrams and Data Visualizations": {
3355
+ "count": 101,
3356
+ "num_samples": 1718,
3357
+ "tasks": [],
3358
+ "average_score": 0.32371820359400666
3359
+ },
3360
+ "Videos": {
3361
+ "count": 43,
3362
+ "num_samples": 698,
3363
+ "tasks": [],
3364
+ "average_score": 0.42875359425696014
3365
+ },
3366
+ "Artistic and Creative Content": {
3367
+ "count": 32,
3368
+ "num_samples": 541,
3369
+ "tasks": [],
3370
+ "average_score": 0.3612041984219992
3371
+ },
3372
+ "Photographs": {
3373
+ "count": 143,
3374
+ "num_samples": 2248,
3375
+ "tasks": [],
3376
+ "average_score": 0.37290568595471846
3377
+ },
3378
+ "3D Models and Aerial Imagery": {
3379
+ "count": 11,
3380
+ "num_samples": 169,
3381
+ "tasks": [],
3382
+ "average_score": 0.19554976321164697
3383
+ }
3384
+ },
3385
+ "output_format": {
3386
+ "contextual_formatted_text": {
3387
+ "count": 98,
3388
+ "num_samples": 1514,
3389
+ "tasks": [],
3390
+ "average_score": 0.3092653492193887
3391
+ },
3392
+ "structured_output": {
3393
+ "count": 110,
3394
+ "num_samples": 1714,
3395
+ "tasks": [],
3396
+ "average_score": 0.3043751656077328
3397
+ },
3398
+ "exact_text": {
3399
+ "count": 83,
3400
+ "num_samples": 1278,
3401
+ "tasks": [],
3402
+ "average_score": 0.2930015244066511
3403
+ },
3404
+ "numerical_data": {
3405
+ "count": 49,
3406
+ "num_samples": 862,
3407
+ "tasks": [],
3408
+ "average_score": 0.3092167834876797
3409
+ },
3410
+ "open_ended_output": {
3411
+ "count": 80,
3412
+ "num_samples": 1454,
3413
+ "tasks": [],
3414
+ "average_score": 0.4523860109667709
3415
+ },
3416
+ "multiple_choice": {
3417
+ "count": 85,
3418
+ "num_samples": 1363,
3419
+ "tasks": [],
3420
+ "average_score": 0.3277812604542708
3421
+ }
3422
+ },
3423
+ "input_num": {
3424
+ "6-8 images": {
3425
+ "count": 21,
3426
+ "num_samples": 314,
3427
+ "tasks": [],
3428
+ "average_score": 0.21139455782312927
3429
+ },
3430
+ "9-image or more": {
3431
+ "count": 41,
3432
+ "num_samples": 623,
3433
+ "tasks": [],
3434
+ "average_score": 0.2711617723374526
3435
+ },
3436
+ "1-image": {
3437
+ "count": 315,
3438
+ "num_samples": 5228,
3439
+ "tasks": [],
3440
+ "average_score": 0.3576735443060994
3441
+ },
3442
+ "video": {
3443
+ "count": 43,
3444
+ "num_samples": 698,
3445
+ "tasks": [],
3446
+ "average_score": 0.42875359425696014
3447
+ },
3448
+ "4-5 images": {
3449
+ "count": 34,
3450
+ "num_samples": 520,
3451
+ "tasks": [],
3452
+ "average_score": 0.19839956701033565
3453
+ },
3454
+ "2-3 images": {
3455
+ "count": 51,
3456
+ "num_samples": 802,
3457
+ "tasks": [],
3458
+ "average_score": 0.27267126872569447
3459
+ }
3460
+ },
3461
+ "app": {
3462
+ "Information_Extraction": {
3463
+ "count": 72,
3464
+ "num_samples": 1124,
3465
+ "tasks": [],
3466
+ "average_score": 0.38321397541649777
3467
+ },
3468
+ "Planning": {
3469
+ "count": 78,
3470
+ "num_samples": 1239,
3471
+ "tasks": [],
3472
+ "average_score": 0.14301905320436192
3473
+ },
3474
+ "Coding": {
3475
+ "count": 31,
3476
+ "num_samples": 474,
3477
+ "tasks": [],
3478
+ "average_score": 0.2849545194421855
3479
+ },
3480
+ "Perception": {
3481
+ "count": 145,
3482
+ "num_samples": 2313,
3483
+ "tasks": [],
3484
+ "average_score": 0.3779947327886569
3485
+ },
3486
+ "Metrics": {
3487
+ "count": 20,
3488
+ "num_samples": 309,
3489
+ "tasks": [],
3490
+ "average_score": 0.39678729061309725
3491
+ },
3492
+ "Science": {
3493
+ "count": 29,
3494
+ "num_samples": 574,
3495
+ "tasks": [],
3496
+ "average_score": 0.29682445889316517
3497
+ },
3498
+ "Knowledge": {
3499
+ "count": 97,
3500
+ "num_samples": 1605,
3501
+ "tasks": [],
3502
+ "average_score": 0.4096377585306089
3503
+ },
3504
+ "Mathematics": {
3505
+ "count": 33,
3506
+ "num_samples": 547,
3507
+ "tasks": [],
3508
+ "average_score": 0.26194160419181234
3509
+ }
3510
+ }
3511
+ },
3512
  "Claude_3.5": {
3513
  "skills": {
3514
  "Object Recognition and Classification": {
 
3521
  "count": 137,
3522
  "num_samples": 2239,
3523
  "tasks": [],
3524
+ "average_score": 0.6046357055234819
3525
  },
3526
  "Language Understanding and Generation": {
3527
  "count": 154,
3528
  "num_samples": 2509,
3529
  "tasks": [],
3530
+ "average_score": 0.5712627152062051
3531
  },
3532
  "Scene and Event Understanding": {
3533
  "count": 154,
 
3607
  "count": 143,
3608
  "num_samples": 2248,
3609
  "tasks": [],
3610
+ "average_score": 0.5096772701625744
3611
  },
3612
  "3D Models and Aerial Imagery": {
3613
  "count": 11,
 
3621
  "count": 98,
3622
  "num_samples": 1514,
3623
  "tasks": [],
3624
+ "average_score": 0.5066797418318023
3625
  },
3626
  "structured_output": {
3627
  "count": 110,
 
3671
  "count": 315,
3672
  "num_samples": 5228,
3673
  "tasks": [],
3674
+ "average_score": 0.5373019912310933
3675
  },
3676
  "video": {
3677
  "count": 43,
 
3715
  "count": 145,
3716
  "num_samples": 2313,
3717
  "tasks": [],
3718
+ "average_score": 0.532329797132399
3719
  },
3720
  "Metrics": {
3721
  "count": 20,
static/eval_results/all_summary.json CHANGED
@@ -5,16 +5,16 @@
5
  "num_eval_samples": 6539,
6
  "num_not_eval_samples": 0,
7
  "num_total_samples": 6961,
8
- "macro_mean_score": 0.5187898818829914,
9
- "micro_mean_score": 0.5127977300993917
10
  },
11
  "core_cot": {
12
  "num_eval_tasks": 440,
13
  "num_eval_samples": 6539,
14
  "num_not_eval_samples": 0,
15
  "num_total_samples": 6961,
16
- "macro_mean_score": 0.5251654337401854,
17
- "micro_mean_score": 0.522332974147119
18
  },
19
  "open": {
20
  "num_eval_tasks": 65,
@@ -23,7 +23,7 @@
23
  "macro_mean_score": 0.6478225794744895,
24
  "micro_mean_score": 0.665391229578676
25
  },
26
- "overall_score": 0.5409529871515315
27
  },
28
  "Gemini_1.5_pro_002": {
29
  "core_noncot": {
@@ -39,8 +39,8 @@
39
  "num_eval_samples": 6539,
40
  "num_not_eval_samples": 0,
41
  "num_total_samples": 6961,
42
- "macro_mean_score": 0.481393687771543,
43
- "micro_mean_score": 0.4756661334397647
44
  },
45
  "open": {
46
  "num_eval_tasks": 65,
@@ -49,7 +49,7 @@
49
  "macro_mean_score": 0.5858190649927173,
50
  "micro_mean_score": 0.6104901117798793
51
  },
52
- "overall_score": 0.4948345779089219
53
  },
54
  "Gemini_1.5_flash_002": {
55
  "core_noncot": {
@@ -91,8 +91,8 @@
91
  "num_eval_samples": 6539,
92
  "num_not_eval_samples": 0,
93
  "num_total_samples": 6961,
94
- "macro_mean_score": 0.5023557473841108,
95
- "micro_mean_score": 0.4985442599850241
96
  },
97
  "open": {
98
  "num_eval_tasks": 65,
@@ -101,7 +101,33 @@
101
  "macro_mean_score": 0.6373907158949892,
102
  "micro_mean_score": 0.6569647463456579
103
  },
104
- "overall_score": 0.519736485905313
105
  },
106
  "GPT_4o_mini": {
107
  "core_noncot": {
@@ -414,5 +440,57 @@
414
  "micro_mean_score": 0.35649183147033553
415
  },
416
  "overall_score": 0.138206224513898
417
  }
418
  }
 
5
  "num_eval_samples": 6539,
6
  "num_not_eval_samples": 0,
7
  "num_total_samples": 6961,
8
+ "macro_mean_score": 0.5203470034386184,
9
+ "micro_mean_score": 0.514305381949725
10
  },
11
  "core_cot": {
12
  "num_eval_tasks": 440,
13
  "num_eval_samples": 6539,
14
  "num_not_eval_samples": 0,
15
  "num_total_samples": 6961,
16
+ "macro_mean_score": 0.5265059698578094,
17
+ "micro_mean_score": 0.5236365938368621
18
  },
19
  "open": {
20
  "num_eval_tasks": 65,
 
23
  "macro_mean_score": 0.6478225794744895,
24
  "micro_mean_score": 0.665391229578676
25
  },
26
+ "overall_score": 0.542120979016392
27
  },
28
  "Gemini_1.5_pro_002": {
29
  "core_noncot": {
 
39
  "num_eval_samples": 6539,
40
  "num_not_eval_samples": 0,
41
  "num_total_samples": 6961,
42
+ "macro_mean_score": 0.48154520292305814,
43
+ "micro_mean_score": 0.47581906202211677
44
  },
45
  "open": {
46
  "num_eval_tasks": 65,
 
49
  "macro_mean_score": 0.5858190649927173,
50
  "micro_mean_score": 0.6104901117798793
51
  },
52
+ "overall_score": 0.49496659111024205
53
  },
54
  "Gemini_1.5_flash_002": {
55
  "core_noncot": {
 
91
  "num_eval_samples": 6539,
92
  "num_not_eval_samples": 0,
93
  "num_total_samples": 6961,
94
+ "macro_mean_score": 0.5029618079901714,
95
+ "micro_mean_score": 0.4991559743144323
96
  },
97
  "open": {
98
  "num_eval_tasks": 65,
 
101
  "macro_mean_score": 0.6373907158949892,
102
  "micro_mean_score": 0.6569647463456579
103
  },
104
+ "overall_score": 0.5202645387105935
105
+ },
106
+ "Claude_3.5_new": {
107
+ "core_noncot": {
108
+ "num_eval_tasks": 440,
109
+ "num_eval_samples": 6539,
110
+ "num_not_eval_samples": 0,
111
+ "num_total_samples": 6961,
112
+ "macro_mean_score": 0.4919657684484185,
113
+ "micro_mean_score": 0.4874520567007144
114
+ },
115
+ "core_cot": {
116
+ "num_eval_tasks": 440,
117
+ "num_eval_samples": 6539,
118
+ "num_not_eval_samples": 0,
119
+ "num_total_samples": 6961,
120
+ "macro_mean_score": 0.525918992480593,
121
+ "micro_mean_score": 0.5230784020211157
122
+ },
123
+ "open": {
124
+ "num_eval_tasks": 65,
125
+ "num_eval_samples": 1163,
126
+ "num_total_samples": 1224,
127
+ "macro_mean_score": 0.6563419761104125,
128
+ "micro_mean_score": 0.6724419604471196
129
+ },
130
+ "overall_score": 0.5427061091854214
131
  },
132
  "GPT_4o_mini": {
133
  "core_noncot": {
 
440
  "micro_mean_score": 0.35649183147033553
441
  },
442
  "overall_score": 0.138206224513898
443
+ },
444
+ "Aria": {
445
+ "core_noncot": {
446
+ "num_eval_tasks": 440,
447
+ "num_eval_samples": 6539,
448
+ "num_not_eval_samples": 0,
449
+ "num_total_samples": 6961,
450
+ "macro_mean_score": 0.30485930718699694,
451
+ "micro_mean_score": 0.3016713629035311
452
+ },
453
+ "core_cot": {
454
+ "num_eval_tasks": 440,
455
+ "num_eval_samples": 6539,
456
+ "num_not_eval_samples": 0,
457
+ "num_total_samples": 6961,
458
+ "macro_mean_score": 0.289073788209904,
459
+ "micro_mean_score": 0.2859007507765791
460
+ },
461
+ "open": {
462
+ "num_eval_tasks": 65,
463
+ "num_eval_samples": 1163,
464
+ "num_total_samples": 1224,
465
+ "macro_mean_score": 0.5103725263180767,
466
+ "micro_mean_score": 0.5349957007738607
467
+ },
468
+ "overall_score": 0.3313115037088191
469
+ },
470
+ "NVLM": {
471
+ "core_noncot": {
472
+ "num_eval_tasks": 440,
473
+ "num_eval_samples": 6539,
474
+ "num_not_eval_samples": 0,
475
+ "num_total_samples": 6961,
476
+ "macro_mean_score": 0.2420528895703979,
477
+ "micro_mean_score": 0.23838419989257642
478
+ },
479
+ "core_cot": {
480
+ "num_eval_tasks": 440,
481
+ "num_eval_samples": 6539,
482
+ "num_not_eval_samples": 0,
483
+ "num_total_samples": 6961,
484
+ "macro_mean_score": 0.21589726765847422,
485
+ "micro_mean_score": 0.21406043849932396
486
+ },
487
+ "open": {
488
+ "num_eval_tasks": 65,
489
+ "num_eval_samples": 1163,
490
+ "num_total_samples": 1224,
491
+ "macro_mean_score": 0.3478114310231307,
492
+ "micro_mean_score": 0.3947549441100602
493
+ },
494
+ "overall_score": 0.25566537510391796
495
  }
496
  }
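
Two score flavors appear throughout all_summary.json: macro_mean_score (each task weighted equally) and micro_mean_score (each sample weighted equally, by the usual convention and consistent with the num_eval_tasks/num_eval_samples fields), which is why the two diverge when per-task sample counts vary. The overall_score values are consistent with a task-count-weighted mix of the better core macro score (CoT vs. non-CoT) and the open-set macro score; this is an inference from the numbers above rather than documented behavior, but the sketch below reproduces them to six decimals:

import json

with open("static/eval_results/all_summary.json", "r") as f:
    summary = json.load(f)

# Re-derive overall_score as (440 * best core macro + 65 * open macro) / 505,
# where 440 and 65 are the num_eval_tasks of the core and open splits.
for model in ("Claude_3.5_new", "Aria", "NVLM"):
    s = summary[model]
    best_core = max(s["core_cot"]["macro_mean_score"],
                    s["core_noncot"]["macro_mean_score"])
    n_core = s["core_cot"]["num_eval_tasks"]
    n_open = s["open"]["num_eval_tasks"]
    derived = (n_core * best_core + n_open * s["open"]["macro_mean_score"]) / (n_core + n_open)
    print(f"{model}: derived={derived:.6f} reported={s['overall_score']:.6f}")
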
utils.py CHANGED
@@ -17,15 +17,18 @@ with open("./static/eval_results/all_summary.json", "r") as f:
17
 
18
  # Define model name mapping
19
  MODEL_NAME_MAP = {
20
  "GPT_4o": "GPT-4o (0513)",
21
- "Claude_3.5": "Claude-3.5-Sonnet",
22
  "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
23
  "InternVL2_76B": "InternVL2-Llama3-76B",
24
  "Qwen2_VL_72B": "Qwen2-VL-72B",
25
  "llava_onevision_72B": "Llava-OneVision-72B",
26
  "GPT_4o_mini": "GPT-4o mini",
27
  "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
28
  "Pixtral_12B": "Pixtral 12B",
29
  "Qwen2_VL_7B": "Qwen2-VL-7B",
30
  "InternVL2_8B": "InternVL2-8B",
31
  "llava_onevision_7B": "Llava-OneVision-7B",
@@ -92,10 +95,6 @@ KEYWORD_NAME_MAP = {
92
  SUPER_GROUPS = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
93
  for dim in MODEL_DATA[next(iter(MODEL_DATA))]}
94
 
95
- SUBMISSION_NAME = "test_leaderboard_submission"
96
- SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/cccjc/", SUBMISSION_NAME)
97
- CSV_DIR = "./test_leaderboard_submission/results.csv"
98
-
99
  def get_original_dimension(mapped_dimension):
100
  return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
101
 
@@ -105,12 +104,12 @@ def get_original_keyword(mapped_keyword):
105
  # Define model groups
106
  MODEL_GROUPS = {
107
  "All": list(MODEL_DATA.keys()),
108
- "Flagship Models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
109
- "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
110
- "Proprietary Flagship models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
111
- "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
112
- "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
113
- "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
114
  }
115
 
116
  def get_display_model_name(model_name):
 
17
 
18
  # Define model name mapping
19
  MODEL_NAME_MAP = {
20
+ "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
21
  "GPT_4o": "GPT-4o (0513)",
22
+ "Claude_3.5": "Claude-3.5-Sonnet (0622)",
23
  "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
24
  "InternVL2_76B": "InternVL2-Llama3-76B",
25
  "Qwen2_VL_72B": "Qwen2-VL-72B",
26
  "llava_onevision_72B": "Llava-OneVision-72B",
27
+ "NVLM": "NVLM-72B",
28
  "GPT_4o_mini": "GPT-4o mini",
29
  "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
30
  "Pixtral_12B": "Pixtral 12B",
31
+ "Aria": "Aria-MoE-25B",
32
  "Qwen2_VL_7B": "Qwen2-VL-7B",
33
  "InternVL2_8B": "InternVL2-8B",
34
  "llava_onevision_7B": "Llava-OneVision-7B",
 
95
  SUPER_GROUPS = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
96
  for dim in MODEL_DATA[next(iter(MODEL_DATA))]}
97
 
98
  def get_original_dimension(mapped_dimension):
99
  return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
100
 
 
104
  # Define model groups
105
  MODEL_GROUPS = {
106
  "All": list(MODEL_DATA.keys()),
107
+ "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM-72B'],
108
+ "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
109
+ "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
110
+ "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
111
+ "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM'],
112
+ "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
113
  }
114
 
115
  def get_display_model_name(model_name):
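
The tail of utils.py is truncated in this diff, so here is a hypothetical sketch of how the renamed keys resolve, assuming get_display_model_name simply falls back to the raw key when a model is missing from MODEL_NAME_MAP (the real body is cut off above):

# Abbreviated to the entries this commit touches.
MODEL_NAME_MAP = {
    "Claude_3.5_new": "Claude-3.5-Sonnet (1022)",
    "Claude_3.5": "Claude-3.5-Sonnet (0622)",
    "NVLM": "NVLM-72B",
    "Aria": "Aria-MoE-25B",
}

def get_display_model_name(model_name):
    # Assumed fallback behaviour; not confirmed by the truncated diff.
    return MODEL_NAME_MAP.get(model_name, model_name)

for key in ("Claude_3.5_new", "Claude_3.5", "NVLM", "Aria"):
    print(key, "->", get_display_model_name(key))
# Claude_3.5_new -> Claude-3.5-Sonnet (1022)
# Claude_3.5 -> Claude-3.5-Sonnet (0622)
# NVLM -> NVLM-72B
# Aria -> Aria-MoE-25B

One design point this implies: the lists in MODEL_GROUPS should use the raw keys (e.g. 'NVLM', not the display name 'NVLM-72B'), since MODEL_GROUPS["All"] is built from MODEL_DATA.keys() and the display names are presumably applied only at render time via get_display_model_name.
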